示例#1
0
    def comm_info(self, comm_url_list):
        """Fetch each community detail page and persist the parsed record.

        For every relative URL the page is downloaded, the community fields
        are regex-extracted and stored via Comm.insert_db(), then the
        building-detail link is handed to self.build_info().  Any failure is
        logged and that community is skipped.
        """
        flags = re.S | re.M
        for comm_url in comm_url_list:
            try:
                res = requests.get("http://as.gzfcxx.cn" + comm_url,
                                   headers=self.headers)
                page = res.text
                co = Comm(co_index)
                co.co_name = re.search('项目名称.*?ck">(.*?)<', page, flags).group(1)
                co.co_id = re.search('yszh=(\d+)', comm_url).group(1)
                co.co_develops = re.search('开发商.*?ck">(.*?)<', page, flags).group(1)
                co.co_address = re.search('坐落.*?ck">(.*?)<', page, flags).group(1)
                co.co_pre_sale = re.search('许可证.*?ck">(.*?)<', page, flags).group(1)
                co.co_handed_time = re.search('交房时间.*?ck">(.*?)<', page, flags).group(1)
                co.insert_db()

                tree = etree.HTML(page)
                build_detail = tree.xpath("//a[@class='a3']/@href")[0]
            except Exception as e:
                log.error('小区信息错误', e)
                continue
            self.build_info(build_detail, co.co_id)
示例#2
0
    def start_crawler(self):
        """Submit the search form, collect community links, scrape each one.

        Pages are gbk-encoded; fields are regex-extracted and stored, then
        the window.open(...) building links are passed to self.bu_info().

        NOTE(review): the Submit value below reads "(unable to decode
        value)" — this looks like a placeholder left by snippet extraction,
        not the real form value; confirm against the live form before use.
        """
        data = {
            "Submit":"(unable to decode value)"
        }
        res = requests.post(self.start_url,data=data,headers=self.headers)
        html = etree.HTML(res.content.decode('gbk'))
        comm_url_list = html.xpath("//tr//span[@style='width:270px; color:#006']//a/@href")
        for comm_url in comm_url_list:
            try:
                url = 'http://www.fxfdcw.com/' + comm_url
                com_res = requests.get(url,headers=self.headers)
                con = com_res.content.decode('gbk')
                co = Comm(co_index)
                co.co_id = re.search('xmid=(\d+)',comm_url).group(1)
                co.co_name =  re.search('项目名称.*?">(.*?)</',con,re.S|re.M).group(1)
                co.co_develops = re.search('开发企业:(.*?) &nbsp',con,re.S|re.M).group(1)
                co.co_address = re.search('项目地址.*?">(.*?)</',con,re.S|re.M).group(1)
                co.co_build_size = re.search('建筑面积.*?">(.*?)</',con,re.S|re.M).group(1)
                co.co_all_house = re.search('总套数.*?">(.*?)</',con,re.S|re.M).group(1)
                co.insert_db()

                # building pages are opened via window.open('...') handlers
                bu_list = re.findall("window.open\('(.*?)'\)",con,re.S|re.M)
            except Exception as e:
                print("小区信息错误{}".format(e))
                continue

            self.bu_info(bu_list,co.co_id)
示例#3
0
 def baiyin_start(self):
     """Crawl every result page and dispatch each community row.

     Walks pages 1..N, parses the listing table, and for each data row
     builds a Comm record (area column) before delegating to
     get_comm_detail().  Failed rows are reported with their detail URL
     and skipped.

     Fix: the inner row loop reused the outer page counter ``i`` — the two
     loop variables are now distinct names.
     """
     page = self.get_all_page()
     print(page)
     for page_no in range(1, int(page) + 1):
         res = requests.get(self.url + '?page=' + str(page_no),
                            headers=self.headers)
         html = res.content.decode('gbk')
         tree = etree.HTML(html)
         community_list = tree.xpath('//tr[@align="center"]')
         # first row is the table header; data rows follow
         for row in community_list[1:]:
             try:
                 comm = Comm(self.CO_INDEX)
                 href = row.xpath('td/a/@href')
                 area = row.xpath('td[1]/text()')
                 area = area[0] if area else None
                 href = href[0]
                 comm.area = area
                 self.get_comm_detail(href, comm)
             except Exception as e:
                 href = row.xpath('td/a/@href')
                 if not href:
                     continue
                 comm_url = self.URL_FRONT + href[0]
                 print('小区错误:', comm_url)
                 print(e)
示例#4
0
 def start_crawler(self):
     """Page through the certification list and store each community row.

     Fields are pulled positionally from <td> cells via chained regexes.

     NOTE(review): ``url`` (first request) and ``co_index`` are not defined
     in this method — presumably module-level globals, ``url`` being the
     list's first page; confirm before reuse.
     """
     res = requests.get(url, headers=self.headers)
     content = res.text
     # total page count, e.g. "页数:1/12 "
     page = re.search('页数:1/(.*?) ', content, re.S | re.M).group(1)
     for i in range(1, int(page) + 1):
         page_url = 'http://newhouse.ntfdc.net/house_certification.aspx?p=' + str(
             i)
         response = requests.get(page_url, headers=self.headers)
         html = response.text
         # isolate the result table before splitting it into rows
         comm_html = re.search('class="layer-bd tb-style1">.*?</table>',
                               html, re.S | re.M).group()
         # [1:] skips the header row
         comm_info_list = re.findall('<tr>.*?</tr>', comm_html,
                                     re.S | re.M)[1:]
         for info in comm_info_list:
             try:
                 comm = Comm(co_index)
                 # each pattern skips N leading <td> cells to reach column N+1
                 comm.co_pre_sale = re.search('<td.*?>(.*?)<', info,
                                              re.S | re.M).group(1)
                 comm.co_name = re.search('<td.*?<td.*?>(.*?)<', info,
                                          re.S | re.M).group(1)
                 comm.co_all_size = re.search('<td.*?<td.*?<td.*?>(.*?)<',
                                              info, re.S | re.M).group(1)
                 comm.co_type = re.search(
                     '<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', info,
                     re.S | re.M).group(1)
                 comm.co_pre_sale_date = re.search(
                     '<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', info,
                     re.S | re.M).group(1)
                 comm.co_develops = re.search(
                     '<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<',
                     info, re.S | re.M).group(1)
                 comm.insert_db()
             except Exception as e:
                 print('小区错误,co_index={},url={}'.format(co_index, page_url),
                       e)
示例#5
0
    def start_crawler(self):
        """Scrape the community index of www.ggsfcw.com.

        Collects detail-page links, parses each community's fields with
        regexes, saves the record, then forwards the building anchors to
        self.build_info().  A failing community is logged and skipped.
        """
        res = requests.get(self.start_url, headers=self.headers)
        index_tree = etree.HTML(res.text)
        for comm_url in index_tree.xpath("//div[@class='post']//a/@href"):
            try:
                comm_res = requests.get('http://www.ggsfcw.com/' + comm_url,
                                        headers=self.headers)
                page = comm_res.text
                com_html = etree.HTML(page)
                comm = Comm(co_index)
                comm.co_name = re.search('<h3.*?">(.*?)</', page).group(1)
                comm.co_id = re.search('n=(\d+)', page).group(1)
                comm.co_address = re.search('地址.*?">(.*?)</', page).group(1)
                comm.area = re.search('区县.*?">(.*?)</', page).group(1)
                comm.co_develops = re.search('开发商.*?">(.*?)</', page).group(1)
                comm.co_use = re.search('规划用途.*?">(.*?)</', page).group(1)
                comm.insert_db()
            except Exception as e:
                log.error("小区信息错误", e)
                continue

            bu_list = com_html.xpath("//div[@id='MainContent_divResult']/a")
            self.build_info(bu_list, comm.co_id)
示例#6
0
 def get_comm_info(self, comm_html_list):
     """Parse name / developer / address from each table-row HTML fragment
     and follow the row's detail link.

     Fix: a fragment that does not match the expected layout is now
     reported and skipped instead of aborting the whole batch, matching
     the per-row error handling used by the other list parsers here.
     """
     for i in comm_html_list:
         try:
             comm = Comm(co_index)
             comm.co_name = re.search('<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
             comm.co_develops = re.search('<td.*?><a.*?>(.*?)<', i, re.S | re.M).group(1)
             comm.co_address = re.search('<td.*?<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
             detail_url = re.search('href="(.*?)"', i, re.S | re.M).group(1)
         except Exception as e:
             print('小区信息错误', e)
             continue
         self.get_comm_detail(detail_url, comm)
示例#7
0
 def get_comm_detail(self, comm_list):
     """Fetch each project page, scrape the community fields, store them,
     then split the hidden buildInfo value into building URLs.

     Failures are printed and the loop moves on; the module-level ``count``
     tracks how many communities were processed successfully.
     """
     flags = re.S | re.M
     for path in comm_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://house.bffdc.gov.cn/public/project/' + path
             html = requests.get(comm_url).text
             comm.co_name = re.search('PROJECT_XMMC">(.*?)<', html, flags).group(1)
             comm.co_develops = re.search('PROJECT_KFQY_NAME">(.*?)<', html, flags).group(1)
             comm.co_address = re.search('PROJECT_XMDZ">(.*?)<', html, flags).group(1)
             comm.area = re.search('PROJECT_SZQY">(.*?)<', html, flags).group(1)
             comm.co_pre_sale = re.search('YSXKZH">(.*?)<', html, flags).group(1)
             comm.insert_db()
             build_info = re.search('id="buildInfo".*?value="(.*?)"', html, flags).group(1)
             self.get_build_info(build_info.split(';;'), comm.co_name)
             global count
             count += 1
             print(count)
         except Exception as e:
             print(e)
示例#8
0
 def start_crawler(self):
     """POST an encoded XML query to the project-introduce endpoint and
     parse every result row into a Comm record.

     The payload is a URL-encoded XML fragment (QueryCode=ProjectIntroduce,
     PageIndex/PageSize, sort order, site id, ...) expected by the
     _method=GetDataToDynamicInXml handler; results come back as an HTML
     table whose cells are matched positionally below.

     NOTE(review): ``url`` and ``co_index`` are not defined in this method
     — presumably module-level globals; confirm before reuse.
     """
     querystring = {"_method": "GetDataToDynamicInXml", "_session": "rw"}
     payload = "xmlInfo=%263Croot%2620QueryCode%263D%2622ProjectIntroduce%2622%2620PageIndex%263D%26221%2622%2620PageSize%263D%262215%2622%2620SortField%263D%2622%2620ORDER%2620BY%2620Name%2622%2620QueryString%263D%2622QueryCode%263DProjectIntroduce%2626amp%263BShowModeCode%263Ddefault%2622%2620BeginDate%263D%2622%262000%263A00%263A00%2622%2620EndDate%263D%2622%262023%263A59%263A59%2622%2620Flag%263D%2622TitleBody%2622%2620TitlesWidthInfo%263D%2622EnterPriseName%267C0%2624Name%267C0%2624Location%267C0%2624SoilUse%267C0%2622%2620IsUseOCache%263D%26220%2622%2620IsUserID%263D%26220%2622%2620SiteId%263D%26228907bd13-1d14-4f9e-8c01-e482d9590d10%2622%2620LockedColumn%263D%26220%2622%2620IsLocked%263D%26220%2622%2620ClientWidth%263D%26221601%2622%2620ShowModeCode%263D%2622default%2622%2620Language%263D%2622chinese%2622/%263E"
     response = requests.request("POST",
                                 url,
                                 data=payload,
                                 params=querystring)
     html = response.text
     comm_info_list = re.findall('class="tdctfield tdctwidthset ".*?</tr>',
                                 html, re.S | re.M)
     for i in comm_info_list:
         comm = Comm(co_index)
         # each pattern skips N "spanctfield" spans to reach the Nth column
         comm.co_develops = re.search('class="spanctfield".*?>(.*?)<', i,
                                      re.S | re.M).group(1)
         comm.co_name = re.search(
             'class="spanctfield".*?class="spanctfield".*?<a.*?>(.*?)<', i,
             re.S | re.M).group(1)
         comm.co_address = re.search(
             'class="spanctfield".*?class="spanctfield".*?class="spanctfield".*?>(.*?)<',
             i, re.S | re.M).group(1)
         comm.co_type = re.search(
             'class="spanctfield".*?class="spanctfield".*?class="spanctfield".*?class="spanctfield".*?>(.*?)<',
             i, re.S | re.M).group(1)
         comm.co_id = re.search('EnterPriseName_(.*?)"', i,
                                re.S | re.M).group(1)
         comm.insert_db()
         self.get_build_info(comm.co_id)
示例#9
0
    def co_parse(self,url_list):
        """Parse community detail pages, persist them, then walk buildings.

        For each anchor element in *url_list* the community page is fetched
        and scraped; on success the matching price page is fetched and every
        building id found there is stored and passed to self.house_parse().

        Fix: the bare ``except:`` was narrowed to ``except Exception`` so
        KeyboardInterrupt/SystemExit are no longer swallowed.
        """
        for url in url_list:
            try:
                co_url = url.xpath("./@href")[0]
                new_url = "http://tmsf.qzfdcgl.com" + co_url
                co_res = requests.get(new_url,headers=self.headers)
                con = co_res.text
                co = Comm(co_index)
                co.co_id = re.search('property_(.*?)_info',co_url).group(1)
                co.co_name = re.search('楼盘名称:</span>(.*)',con).group(1)
                co.co_develops = re.search('项目公司:</span>(.*)',con).group(1)
                co.co_address = re.search('物业地址:</span>(.*?)</p',con,re.S|re.M).group(1)
                co.area = re.search('所属城区:</span>(.*)',con).group(1)
                co.insert_db()
                # sid / propertyid feed the per-house price queries later on
                sid = re.search('property_(\d+)_',co_url).group(1)
                propertyid = re.search('(\d+)_info',co_url).group(1)
                bu_url = new_url.replace('info','price')
                res = requests.get(bu_url,headers=self.headers)
                bu_html = etree.HTML(res.text)
                bu_idlist = bu_html.xpath("//dd[@id='building_dd']/a")
            except Exception:
                continue
            # skip the first anchor (presumably an "all buildings" header
            # entry — confirm against the live page)
            for bu_ in bu_idlist[1:]:
                id = bu_.xpath("./@id")[0]
                bu_id = re.search('.*?(\d+)',id).group(1)
                bu = Building(co_index)
                bu.bu_id = bu_id
                bu.co_id = co.co_id
                bu.bu_num = bu_.xpath("./text()")[0]

                bu.insert_db()
                self.house_parse(bu_id,co.co_id,sid,propertyid)
示例#10
0
 def comm_info(self, co_develops, co_pre_sale, co_name, co_pre_sale_date,
               sid):
     """Build a Comm record from pre-extracted field values and persist it.

     *sid* is stored as the community id (co_id).
     """
     record = Comm(co_index)
     record.co_id = sid
     record.co_name = co_name
     record.co_pre_sale = co_pre_sale
     record.co_pre_sale_date = co_pre_sale_date
     record.co_develops = co_develops
     record.insert_db()
示例#11
0
    def start_crawler(self):
        """Page through the community list and store each community.

        Bug fix: the page URL was built as ``'?pageIndex=2' + str(page)``,
        which requested the same (nonexistent) page index on every
        iteration; it now uses the loop's own page number ``i``.
        """
        b = AllListUrl(first_page_url=self.start_url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule='共(\d+)页',
                       )
        page = b.get_page_count()
        for i in range(1,int(page)+1):
            url = self.start_url + '?pageIndex=' + str(i)
            page_res = requests.get(url,headers=self.headers)

            html = etree.HTML(page_res.text)
            comm_info_list = html.xpath("//ul/li/div")
            for comm_info in comm_info_list:
                try:
                    co = Comm(co_index)
                    co.co_name = comm_info.xpath("./p/a/text()")[0]
                    deve = comm_info.xpath("./p[2]/text()")[0]
                    addr = comm_info.xpath("./p[3]/text()")[0]
                    co.co_develops = re.search('开发商:(.*)',deve).group(1)
                    co.co_address = re.search('楼盘地址.*?:(.*)',addr).group(1)
                    comm_url = comm_info.xpath("./p/a/@href")[0]
                    co.co_id = re.search('projectId=(\d+)',comm_url).group(1)
                    co.insert_db()
                    co_url = 'http://www.bdfdc.net' + comm_url
                    co_res = requests.get(co_url,headers=self.headers)
                    # throttle detail-page requests
                    time.sleep(5)
                    bu_html = etree.HTML(co_res.text)
                    bu_url_list = bu_html.xpath("//div[@style]/a")[1:]
                except Exception as e:
                    print("小区信息错误{}".format(e))
                    continue
                self.bu_info(bu_url_list,co.co_id)
示例#12
0
 def get_data_obj(self, analyzer, co_index):
     """Factory: map an analyzer name ('comm' / 'build' / 'house') to the
     matching record object, constructed with *co_index*.

     Returns None for any other analyzer name.
     """
     factories = {'comm': Comm, 'build': Building, 'house': House}
     cls = factories.get(analyzer)
     return cls(co_index) if cls is not None else None
示例#13
0
 def comm(self, tag):
     """Extract one community row, save it, and return the building-list
     URL together with the community id.

     Also bumps the module-level ``count`` progress counter.
     """
     co = Comm(co_index)
     co.co_name = tag.xpath("./td[@width='143']/a/text()")[0]
     co.area = tag.xpath("./td[@width='184']/text()")[0]
     co.co_develops = tag.xpath("./td[@width='192']/text()")[0]
     href = tag.xpath("./td/a/@href")[0]
     co.co_id = re.search('mmcid=(\d+)&', href).group(1)
     co.co_open_time = tag.xpath("./td[@width='95']/text()")[0]
     co.insert_db()
     global count
     count += 1
     print(count)
     return "http://www.syfc.com.cn" + href, co.co_id
示例#14
0
文件: cixi_8.py 项目: w4205/hilder_gv
 def start(self):
     """Collect community links from the news list page and parse each one.

     Each link gets its own Comm record (site index 8) which is filled in
     by self.get_comm_info().

     Fix: iterate the link list directly instead of the
     ``for i in range(len(...))`` anti-pattern (the index was only used to
     re-fetch the element).
     """
     response = requests.get(self.url, headers=self.headers)
     tree = etree.HTML(response.text)
     comm_url_list = tree.xpath('//ul[@class="NewsList"]/li/a/@href')
     for href in comm_url_list:
         comm = Comm(8)
         comm_url = 'http://www.cxsfdcglzx.com/touming/' + href
         print(comm_url)
         self.get_comm_info(comm_url, comm)
示例#15
0
 def start_crawler(self):
     """Scrape each fixed project page listed in self.url and persist it.

     Extracts name / developer / location / floor area with regexes and
     prints the record before inserting it.
     """
     flags = re.S | re.M
     for page_url in self.url:
         page = requests.get(url=page_url).content.decode()
         c = Comm(self.co_index)
         c.co_name = re.search('楼盘名称:</h5></td><td><span>(.*?)<', page,
                               flags).group(1)
         c.co_develops = re.search('开发建设单位:</h5></td><td><span>(.*?)</span>',
                                   page, flags).group(1)
         c.co_address = re.search('项目位置:</h5></td><td><span>(.*?)</span>',
                                  page, flags).group(1)
         c.co_build_size = re.search('建筑面积:</h5></td><td><span>(.*?)</span>',
                                     page, flags).group(1)
         print(c.to_dict())
         c.insert_db()
示例#16
0
 def start(self):
     """Page through the gbk-encoded house list and parse every community.

     Each detail link is paired positionally with its "所属区域" cell; the
     area is stored on the record before delegating to get_comm_info().

     Fixes: the inner loop reused the outer page counter ``i`` (shadowing),
     and ``for i in range(len(...))`` is replaced with enumerate.
     """
     b = AllListUrl(first_page_url=self.url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='gbk',
                    page_count_rule='共(.*?)页', )
     page = b.get_page_count()
     for page_no in range(1, int(page) + 1):
         all_page_url = self.url + '&page=' + str(page_no)
         response = requests.get(url=all_page_url, headers=self.headers)
         html = response.text
         tree = etree.HTML(html)
         comm_url_list = tree.xpath('//dt[@class="name"]/a/@href')
         area_list = tree.xpath('//dl[@class="houseList_n"]/dd[3]/text()')
         for idx, comm_path in enumerate(comm_url_list):
             url = 'http://www.fzfgj.cn/' + comm_path
             try:
                 comm = Comm(11)
                 # area_list is aligned with comm_url_list by position
                 comm.area = area_list[idx].replace('所属区域:', '')
                 self.get_comm_info(url, comm)
             except BaseException as e:
                 print('小区错误,co_index={},url={}'.format(co_index, url), e)
示例#17
0
    def start_crawler(self):
        """Walk every listing page, parse each community row and store it.

        The row's detail link is resolved against the PubInfo base URL and
        the record returned by get_comm_detail() is the one persisted.
        Broken rows are reported and skipped.
        """
        flags = re.S | re.M
        for url in self.get_all_page_url():
            res = requests.get(url, headers=self.headers)
            html = res.content.decode('gb2312')
            info_list = re.search('可售套数(.*?)<!--进行翻页显示和处理-->', html,
                                  flags).group(1)
            rows = re.findall('<tr.*?</tr>', info_list, flags)
            for info in rows:
                try:
                    comm = Comm(1)

                    detail_path = re.search('<a href="(.*?)">', info,
                                            flags).group(1)
                    comm.area = re.findall('<td align="center">(.*?)</td>',
                                           info, flags)[1]
                    href = 'http://www.bsfcj.com/PubInfo/' + detail_path

                    comm = self.get_comm_detail(href, comm)
                    comm.insert_db()
                except Exception as e:
                    print('小区列表页解析有错,co_index={},'.format(self.co_index), e)
示例#18
0
 def get_comm_info(self, comm_info_list):
     """Parse name / total units / total floor area out of each row
     fragment and store the record; malformed fragments are reported and
     skipped.
     """
     flags = re.S | re.M
     for fragment in comm_info_list:
         try:
             comm = Comm(co_index)
             comm.co_name = re.search('<td>(.*?)</td>', fragment,
                                      flags).group(1)
             comm.co_all_house = re.search('<td.*?<td>(.*?)</td>', fragment,
                                           flags).group(1)
             comm.co_all_size = re.search('<td.*?<td.*?<td>(.*?)</td>',
                                          fragment, flags).group(1)
             comm.insert_db()
         except Exception as e:
             print('小区错误,co_index={},html_str={}'.format(co_index, fragment), e)
示例#19
0
 def get_comm_info(self, comm_url_list):
     """Declare extraction rules for each community page and delegate the
     actual download/parse to ProducerListUrl.

     Unlike the other parsers in this file, the Comm attributes here are
     assigned regex *patterns*, not values: ``comm.to_dict()`` turns them
     into the analyzer_rules_dict that ProducerListUrl applies to the
     gbk-decoded page.  ``get_details()`` returns the building-page links
     matched by ``current_url_rule``, which are then crawled.
     """
     for i in comm_url_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://www.fjnpfdc.com/House/' + i
             # field name -> regex rule; group(1) is the captured value
             comm.co_develops = '公司名称:.*?<td.*?>(.*?)<'
             comm.co_pre_sale = '预售许可证:.*?<td.*?>(.*?)<'
             comm.co_name = '项目名称:.*?<td.*?>(.*?)<'
             comm.co_address = '项目坐落:.*?<td.*?>(.*?)<'
             comm.co_use = '规划用途:.*?<td.*?>(.*?)<'
             comm.co_build_size = '建筑面积:.*?<td.*?>(.*?)<'
             comm.co_id = 'ProjectId=(.*?)&'
             p = ProducerListUrl(
                 page_url=comm_url,
                 request_type='get',
                 encode='gbk',
                 analyzer_rules_dict=comm.to_dict(),
                 current_url_rule="<a href='(BuildingInfo.*?)'",
                 analyzer_type='regex',
                 headers=self.headers)
             build_url_list = p.get_details()
             self.get_build_info(build_url_list)
         except Exception as e:
             print("co_index={},小区{}错误".format(co_index, i), e)
示例#20
0
 def get_comm_info(self, html):
     """Locate the presale-project table in *html* and dispatch every
     listed community to get_comm_detail(); bumps the global progress
     counter.
     """
     html_info = re.search('预售商品房住宅项目公示(.*?)</table>', html).group(1)
     comm_list = re.findall(
         '<td(.*?)ahref="(.*?)">(.*?)</a(.*?)<ahref="(.*?)">(.*?)</a></td><td(.*?)>(.*?)</td></tr>',
         html_info)
     for fields in comm_list:
         try:
             comm = Comm(2)
             # fields[1] is the relative detail link captured by the regex
             url = 'http://www.bjjs.gov.cn/' + fields[1]
             self.get_comm_detail(url, comm)
             global count
             count += 1
             print(count)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, url), e)
示例#21
0
 def get_comm_info(self, comm_detail_url_list):
     """Declare extraction rules for each Ningde community page and let
     ProducerListUrl do the download/parse.

     As in the fjnpfdc parser, the Comm attributes hold regex *patterns*:
     ``comm.to_dict()`` becomes the analyzer_rules_dict applied to the
     page, and ``get_details()`` yields the building links matched by
     ``current_url_rule``.

     NOTE(review): if ``Comm(co_index)`` itself raised, ``comm_url`` would
     be unbound inside the except handler — presumably never happens in
     practice; confirm.
     """
     for i in comm_detail_url_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://www.ndjsj.gov.cn' + i
             # field name -> regex rule; group(1) is the captured value
             comm.co_develops = '公司名称:.*?<td.*?>(.*?)<'
             comm.co_name = '项目名称:.*?<td.*?>(.*?)<'
             comm.co_pre_sale = '预售许可证:.*?<td.*?>(.*?)<'
             comm.co_address = '项目坐落:.*?<td.*?>(.*?)<'
             comm.co_use = '规划用途:.*?<td.*?>(.*?)<'
             comm.co_size = '占地面积:.*?<td.*?>(.*?)<'
             comm.co_build_size = '建筑面积:.*?<td.*?>(.*?)<'
             p = ProducerListUrl(
                 page_url=comm_url,
                 request_type='get',
                 encode='utf-8',
                 analyzer_rules_dict=comm.to_dict(),
                 current_url_rule="(BuildingInfo\?BuildingId=.*?)'",
                 analyzer_type='regex',
                 headers=self.headers)
             build_url_list = p.get_details()
             self.get_build_info(build_url_list)
         except Exception as e:
             print('宁德小区错误,url={}'.format(comm_url), e)
示例#22
0
 def get_comm_detail(self, comm_detail_url):
     """Download a community detail page and fill presale/land/size fields.

     Always returns the Comm record; on any network or parsing failure the
     partially-filled record is returned after logging the error.
     """
     comm = Comm(co_index)
     try:
         page = requests.get(comm_detail_url, headers=self.headers).text

         def grab(pattern):
             # helper: first capture group of *pattern* in the page
             return re.search(pattern, page, re.S | re.M).group(1)

         comm.co_pre_sale = grab('预售许可证号:.*?<td.*?>(.*?)<')
         comm.co_land_use = grab('土地使用权证号及用途:.*?<td.*?>(.*?)</td')
         comm.co_build_size = grab('本期预售总建筑面积:.*?<td.*?>(.*?)</td')
         comm.co_all_house = grab('本期总单元套数:.*?<td.*?>(.*?)</td')
         comm.co_pre_sale_date = grab('发证日期:.*?<td.*?>(.*?)</td')
         return comm
     except Exception as e:
         print('小区详情错误,co_index={},url={}'.format(co_index, comm_detail_url), e)
         return comm
示例#23
0
 def get_comm_info(self, comm_url_list):
     """Fetch each community page and populate presale/developer fields.

     Bug fix: the request used the undefined name ``comm_url`` (NameError
     on every iteration); it now requests the loop's own URL ``i``.

     NOTE(review): the parsed record is never insert_db()'d here — confirm
     whether persistence happens downstream or was simply forgotten.
     """
     flags = re.S | re.M
     for i in comm_url_list:
         comm = Comm(co_index)
         response = requests.get(i, headers=self.headers)
         html = response.text
         comm.co_pre_sale = re.search('预销售许可证号.*?<td.*?>(.*?)<', html,
                                      flags).group(1)
         comm.co_develops = re.search('开发建设单位.*?<td.*?>(.*?)<', html,
                                      flags).group(1)
         comm.co_handed_time = re.search('发证日期.*?<td.*?>(.*?)<', html,
                                         flags).group(1)
         comm.co_name = re.search('项 目 名 称.*?<td.*?>(.*?)<', html,
                                  flags).group(1)
         comm.co_address = re.search('项 目 座 落.*?<td.*?>(.*?)<', html,
                                     flags).group(1)
示例#24
0
 def start(self):
     """Page through the complexPro listing and parse every community.

     Fixes: iterate the link list directly instead of
     ``for i in range(len(...))``, and build ``comm_url`` before the try
     block so the except handler can never hit an unbound name.
     """
     page = self.get_all_page()
     for page_no in range(1, int(page) + 1):
         url = 'http://www.czhome.com.cn/complexPro.asp?page=' + str(
             page_no
         ) + '&districtID=0&projectAdr=&projectName=&buildingType=0&houseArea=0&averagePrice=0&selState=-1'
         response = requests.get(url, headers=self.headers)
         html = response.content.decode('gbk')
         tree = etree.HTML(html)
         comm_url_list = tree.xpath('//*[@id="Table8"]/tr/td[2]/a/@href')
         for comm_path in comm_url_list:
             comm_url = 'http://www.czhome.com.cn/' + comm_path
             try:
                 comm = Comm(7)
                 self.get_comm_info(comm_url, comm)
             except Exception as e:
                 print("co_index={},小区:{}无法提取".format(co_index, comm_url))
                 print(e)
示例#25
0
 def start_crawler(self):
     """Query each district, follow its paging links, and parse every
     community row found in the gbk-encoded result tables.

     Each row's cells are matched positionally (name, address, total
     units, total area, district); parsing of the row's own detail page is
     delegated to self.get_comm_info().
     """
     for i in self.area_list:
         data = {'districtID': i}
         res = requests.post(url='http://www.fangdi.com.cn/complexPro.asp',
                             data=data)
         html_str = res.content.decode('gbk')
         # the POST response embeds the district's paging links
         url_list = re.findall('value="(/complexpro.*?)"', html_str,
                               re.S | re.M)
         for k in url_list:
             response = requests.get('http://www.fangdi.com.cn' + k,
                                     headers=self.headers)
             html = response.content.decode('gbk')
             # isolate the results table (between the header and the pager)
             comm_html = re.search('位置<.*?页/共', html, re.S | re.M).group()
             comm_info_list = re.findall('<tr valign=.*?</tr>', comm_html,
                                         re.S | re.M)
             for info in comm_info_list:
                 try:
                     comm = Comm(co_index)
                     comm_url = re.search('<a href=(.*?)>', info,
                                          re.S | re.M).group(1)
                     # each pattern skips N <td> cells to reach column N+1
                     comm.co_name = re.search('<a.*?>(.*?)<', info,
                                              re.S | re.M).group(1)
                     comm.co_address = re.search('<a.*?<td.*?>(.*?)<', info,
                                                 re.S | re.M).group(1)
                     comm.co_all_house = re.search(
                         '<a.*?<td.*?<td.*?>(.*?)<', info,
                         re.S | re.M).group(1)
                     comm.co_all_size = re.search(
                         '<a.*?<td.*?<td.*?<td.*?>(.*?)<', info,
                         re.S | re.M).group(1)
                     comm.area = re.search(
                         '<a.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', info,
                         re.S | re.M).group(1)
                     comm.co_id = re.search('projectID=(.*?)==', info,
                                            re.S | re.M).group(1)
                     self.get_comm_info(comm_url, comm)
                 except Exception as e:
                     print(
                         '小区错误,co_index={},url={}'.format(
                             co_index, 'http://www.fangdi.com.cn' + k), e)
示例#26
0
 def get_comm_info(self, all_html_list):
     """Split each page into table rows, parse every community row, store
     it, and fetch its buildings.

     The first row of each table is skipped (header); a failing page is
     reported and the loop moves on.
     """
     flags = re.S | re.M
     for html in all_html_list:
         try:
             rows = re.findall('<tr>.*?</tr>', html, flags)
             for row in rows[1:]:
                 comm = Comm(co_index)
                 # each pattern skips N center-aligned cells positionally
                 comm.area = re.search('align="center">(.*?)<', row,
                                       flags).group(1)
                 comm.co_name = re.search(
                     'align="center".*?align="center".*?>(.*?)<', row,
                     flags).group(1)
                 comm.co_address = re.search(
                     'align="center".*?align="center".*?align="center".*?title="(.*?)"',
                     row, flags).group(1)
                 comm.co_all_house = re.search(
                     'align="center".*?align="center".*?align="center".*?align="center".*?>(.*?)<',
                     row, flags).group(1)
                 comm.co_id = re.search('projectID=(.*?)&', row,
                                        flags).group(1)
                 comm.insert_db()
                 self.get_build_info(comm.co_id)
         except Exception as e:
             print('解析错误,co_index={},方法:get_comm_info'.format(co_index), e)
示例#27
0
 def start(self):
     """Page through the loupan region listing and parse every community.

     Each link yields both an index page and a detail page: the record is
     filled from the index page first, then enriched from the detail page.
     """
     page = self.get_all_page()
     for page_no in range(1, int(page) + 1):
         url = 'http://www.funi.com/loupan/region_0_0_0_0_' + str(page_no)
         response = self.request_proxy(url)
         tree = etree.HTML(response.text)
         for link in tree.xpath('//dt[@class="clearfix"]/h2/a/@href'):
             comm = Comm(co_index)
             # keep only the part before any ';' (presumably a
             # jsessionid-style suffix — confirm); split() always returns
             # at least one element, so this is safe
             link = link.split(';')[0]
             detail_url = 'http://www.funi.com/' + link + '/detail.htm'
             comm_index_url = 'http://www.funi.com/' + link
             try:
                 comm = self.get_comm_info(comm_index_url, comm)
                 self.get_comm_detail(detail_url, comm)
             except Exception as e:
                 print('小区错误:co_index={},url={}'.format(co_index, detail_url), e)
示例#28
0
    def start(self):
        """Iterate the paged 公示 list and hand every community link to
        get_comm_info(); page-level failures are logged and skipped.

        ``count`` is a simple progress printout of links dispatched.
        """
        page = self.get_all_page()
        count = 0
        for page_no in range(1, int(page) + 1):
            try:
                url = 'http://www.czfdc.gov.cn/spf/gs.php?pageid=' + str(page_no)
                response = requests.get(url, headers=self.headers)
                tree = etree.HTML(response.content.decode('gbk'))

                for href in tree.xpath('//td[@align="left"]/a/@href'):
                    count += 1
                    print(count)
                    comm = Comm(6)
                    self.get_comm_info('http://www.czfdc.gov.cn/spf/' + href, comm)
            except Exception as e:
                print('co_index={},翻页有问题,url={}'.format(self.co_index, url), e)
                continue
示例#29
0
    def start(self):
        """Page through the search index and fetch each community's
        room-view page, delegating the parsing to get_comm_info().

        Detail pages that fail to download are reported and skipped.
        """
        b = AllListUrl(first_page_url=self.url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule='pageTotal = (.*?);', )

        page = b.get_page_count()
        for page_no in range(1, int(page) + 1):
            url = 'http://fsfc.fsjw.gov.cn/search/index.do?p=' + str(page_no)
            response = requests.get(url, headers=self.headers)
            tree = etree.HTML(response.text)
            id_list = tree.xpath('//*[@id="content"]/div[2]/div[1]/dl/dd/h3/a/@value')
            for comm_id in id_list:
                comm = Comm(co_index)
                url = 'http://fsfc.fsjw.gov.cn/hpms_project/roomView.jhtml?id=' + comm_id
                try:
                    response = requests.get(url, headers=self.headers)
                except Exception as e:
                    print(e)
                    print("co_index={},小区详情页{}请求失败".format(co_index, url))
                    continue
                self.get_comm_info(url, response, comm)
示例#30
0
 def get_comm_info(self, comm_url_list):
     """Fetch each community page, store the community record, then store
     every building row found on the same page.

     Building name / unit-count / id lists are extracted with separate
     findall passes and matched up positionally by index — they are
     presumed to stay aligned for well-formed pages (confirm).  Finally the
     building URLs are handed to self.get_house_info().
     """
     for i in comm_url_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://old.newhouse.cnnbfdc.com/' + i
             response = requests.get(comm_url, headers=self.headers)
             html = response.text
             comm.co_name = re.findall('项目名称:.*?<span.*?>(.*?)<', html,
                                       re.S | re.M)[0].strip()
             comm.co_address = re.findall('项目地址:.*?<td.*?>(.*?)<', html,
                                          re.S | re.M)[0].strip()
             comm.co_develops = re.findall('开发公司:.*?<td.*?>(.*?)<', html,
                                           re.S | re.M)[0].strip()
             comm.co_pre_sale = re.findall('预\(现\)售证名称:.*?<td.*?>(.*?)<',
                                           html, re.S | re.M)[0].strip()
             # strip the m² unit suffix from the sellable-area value
             comm.co_build_size = re.findall('纳入网上可售面积:.*?<img.*?>(.*?)<',
                                             html, re.S | re.M)[0].replace(
                                                 'm&sup2;', '').strip()
             # strip the "套" (units) suffix from the sellable-count value
             comm.co_all_house = re.findall('纳入网上可售套数:.*?<img.*?>(.*?)<',
                                            html, re.S | re.M)[0].replace(
                                                '套', '').strip()
             comm.area = re.findall('所在区县:.*?<td.*?>(.*?)<', html,
                                    re.S | re.M)[0].strip()
             comm.co_id = re.findall('mobanshow.aspx\?projectid=(.*?)"',
                                     html, re.S | re.M)[0].strip()
             comm.insert_db()
             # module-level progress counter
             global count
             count += 1
             print(count)
             # building links are opened via window.open('...') handlers
             build_url_list = re.findall("window.open\('(.*?)'", html,
                                         re.S | re.M)
             bu_name_list = re.findall("window.open.*?<font.*?>(.*?)<",
                                       html, re.S | re.M)
             bu_all_house_list = re.findall("window.open.*?<td.*?>(.*?)<",
                                            html, re.S | re.M)
             qrykey = re.findall("qrykey=(.*?)&", html, re.S | re.M)
             for index in range(len(build_url_list)):
                 try:
                     build = Building(co_index)
                     build.bu_name = bu_name_list[index].strip()
                     build.bu_all_house = bu_all_house_list[index].strip()
                     build.co_id = comm.co_id
                     build.bu_id = qrykey[index].strip()
                     build.insert_db()
                 except Exception as e:
                     print(e)
             self.get_house_info(build_url_list)
         except Exception as e:
             print(e)