Пример #1
0
    def get_build_info(self, build_id_list, co_id):
        """Scrape and store every building listed in ``build_id_list``.

        For each id a form POST is sent through the session ``self.s``.  The
        building name, floor count, floor area and developer are cut out of
        the returned HTML with regular expressions and written with
        ``insert_db()``; the same HTML is then passed to
        ``self.get_house_info``.

        A building whose request fails is reported and skipped.

        :param build_id_list: iterable of building ids; each is sent as ``pk``.
        :param co_id: community id stored on every building and forwarded to
            ``get_house_info``.
        :raises AttributeError: if one of the expected labels is missing from
            a response (``re.search`` returns ``None``).
        """
        bu = Building(co_index)
        # The headers are identical for every request, so build them once.
        header = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
            'Referer':
            'http://hkrealestate.haikou.gov.cn/wp_myself/housequery/projectBuildingList.php'
        }
        for build_id in build_id_list:
            formdata = {
                # NOTE(review): the action name is sent verbatim; presumably
                # this spelling is what the endpoint accepts - confirm before
                # changing it.
                "action": "qeurySingleBuilding",
                'pk': str(build_id),
            }
            try:
                build_info = self.s.post(
                    'http://hkrealestate.haikou.gov.cn/wp_myself/housequery/projectBuildHouseAction.php',
                    data=formdata,
                    headers=header)
            except Exception as e:
                print("co_idnex={},楼栋错误".format(co_index), e)
                # No response was obtained: parsing would otherwise hit an
                # unbound name or silently reuse the previous building's page.
                continue

            build_con = build_info.text
            bu.bu_id = build_id
            bu.co_id = co_id
            bu.bu_num = re.search('幢名称.*?<td>(.*?)<', build_con,
                                  re.S | re.M).group(1)
            bu.bu_floor = re.search('总层数.*?<td>(.*?)<', build_con,
                                    re.S | re.M).group(1)
            bu.bu_build_size = re.search('>建筑面积.*?<td>(.*?)<', build_con,
                                         re.S | re.M).group(1)
            bu.bo_develops = re.search('房地产企业.*?">(.*?)</td', build_con,
                                       re.S | re.M).group(1)

            bu.insert_db()

            self.get_house_info(build_con, co_id, build_id)
Пример #2
0
 def bu_parse(self,bu_url,co_id,co_url):
     """Parse one pre-sale page and store each building row it lists.

     The request headers are published as the module-level ``headers``
     (with ``co_url`` as Referer).  The pre-sale licence number and its
     validity date are read once from the page and copied onto every
     building found in the ``donglist`` table; each row is then handed to
     ``self.house_parse``.

     :param bu_url: page path appended to the site root.
     :param co_id: community id stored on each building.
     :param co_url: URL sent as the Referer header.
     """
     global headers
     target = "http://61.143.241.154/" + bu_url
     headers = {
         'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
         'Referer':
             co_url,
     }
     page = requests.get(target, headers=headers)
     page_text = page.content.decode('gbk')
     flags = re.S | re.M
     licence_no = re.search('预售许可证编号.*?blank">(.*?)</a', page_text, flags).group(1)
     licence_date = re.search('预售证有效日期.*?">(.*?)</td', page_text, flags).group(1)
     rows = etree.HTML(page_text).xpath("//table[@id='donglist']//tr")
     for row in rows:
         building = Building(co_index)
         building.co_id = co_id
         row_link = row.xpath("./td/a/@href")[0]
         building.bu_id = re.search('dbh=(.*?)&', row_link).group(1)
         building.bu_num = row.xpath("./td[3]/text()")[0]
         building.bu_floor = row.xpath("./td[4]/text()")[0]
         building.bu_pre_sale = licence_no
         building.bu_pre_sale_date = licence_date
         building.insert_db()
         self.house_parse(row_link, co_id, building.bu_id)
Пример #3
0
    def build_info(self, bu_list, co_id):
        """Follow each pre-sale link in ``bu_list`` and store its buildings.

        Every entry supplies a relative link (4th cell).  The linked page is
        read as GBK; its licence number and validity date are copied onto
        each building row of the ``donglist`` table, the building is
        written with ``insert_db()`` and ``self.house_info`` is called for
        it.

        :param bu_list: lxml elements, one per pre-sale entry.
        :param co_id: community id stored on each building.
        """
        flags = re.S | re.M
        for entry in bu_list:
            rel_link = entry.xpath("./td[4]/a/@href")[0]
            page = requests.get(self.start_url + '/' + rel_link,
                                headers=self.headers)
            page.encoding = 'gbk'
            text = page.text
            licence_no = re.search('预售许可证编号.*?blank">(.*?)</a', text,
                                   flags).group(1)
            licence_date = re.search('预售证有效日期.*?">(.*?)</td', text,
                                     flags).group(1)

            rows = etree.HTML(text).xpath("//table[@id='donglist']/tr")
            for row in rows:
                row_link = row.xpath("./td/a/@href")[0]
                building = Building(co_index)
                building.co_id = co_id
                building.bu_id = re.search('ID={(.*?)}', row_link).group(1)
                building.bu_num = row.xpath("./td[3]/text()")[0]
                building.bu_floor = row.xpath("./td[4]/text()")[0]
                building.bu_pre_sale = licence_no
                building.bu_pre_sale_date = licence_date
                building.insert_db()
                self.house_info(co_id, building.bu_id, row_link)
Пример #4
0
 def get_build_info(self, bu_address_list, bu_num_list, bu_floor_list,
                    bu_url_list, co_id):
     """Store one building per entry of ``bu_url_list`` and parse its houses.

     The four lists are read by the same position.  One second is slept
     before each page request.  The building is written before its page is
     searched for house links; a failure inside ``self.get_house_info`` is
     printed and does not stop the loop.

     :param bu_address_list: building addresses, by position.
     :param bu_num_list: building numbers, by position.
     :param bu_floor_list: floor counts, by position.
     :param bu_url_list: relative building URLs ending in ``ljzid=<id>``.
     :param co_id: community id stored on each building.
     """
     site_root = 'http://183.63.60.194:8808/public/web/'
     flags = re.S | re.M
     for idx, rel_url in enumerate(bu_url_list):
         record = Building(co_index)
         record.bu_address = bu_address_list[idx]
         record.bu_num = bu_num_list[idx]
         record.bu_floor = bu_floor_list[idx]
         record.co_id = co_id
         time.sleep(1)
         page = self.s.get(site_root + rel_url, headers=self.headers)
         record.bu_id = re.search('ljzid=(.*?)$', rel_url).group(1)
         record.insert_db()
         page_text = page.text
         # The house links live in a script fragment of the page.
         table_script = re.search('var _table_html_.*?</script>', page_text,
                                  flags).group()
         house_links = re.findall('房屋号:<a.*?href="(.*?)"', table_script,
                                  flags)
         try:
             self.get_house_info(house_links, record.bu_id)
         except Exception as e:
             message = '房号错误,co_index={},url={}'.format(
                 co_index, site_root + rel_url)
             print(message, e)
Пример #5
0
 def get_build_info(self, co_id):
     """Query the building list of project ``co_id`` and store each row.

     A pre-encoded XML query (with ``co_id`` spliced in as ProjectID) is
     POSTed to the site's query handler.  Every ``<tr>`` of the answer
     except the first yields one building whose number, floor count and
     pre-sale value are taken from its ``spanctfield`` spans and whose id
     is taken from the row's ``id="Tr_..."`` attribute.  After
     ``insert_db()`` the houses are fetched through
     ``self.get_house_info``.

     Any exception (request or parsing) ends the whole run and is printed
     together with the request details.

     :param co_id: project id as a string; it is concatenated into the
         payload.
     """
     build_url = "http://202.103.219.149:7000/ajax/LeadingMIS.CommonModel.CommonQuery.WebUI.AjaxManage.QueryDataParser,LeadingMIS.CommonModel.CommonQuery.WebUI.ashx"
     querystring = {"_method": "GetDataToDynamicInXml", "_session": "rw"}
     payload = "xmlInfo=%263Croot%2620QueryCode%263D%2622BuildingsInfo%2622%2620PageIndex%263D%26221%2622%2620PageSize%263D%262215%2622%2620SortField%263D%2622%2620ORDER%2620BY%2620Name%2622%2620QueryString%263D%2622QueryCode%263DBuildingsInfo%2626amp%263BProjectID%263D" + co_id + "%2622%2620BeginDate%263D%2622%262000%263A00%263A00%2622%2620EndDate%263D%2622%262023%263A59%263A59%2622%2620Flag%263D%2622TitleBody%2622%2620TitlesWidthInfo%263D%2622BuildNo%267C0%2624Name%267C0%2624FloorCount%267C0%2624RoomCount%267C0%2624YCJZArea%267C0%2624Structure%267C0%2624YSXKCer%267C0%2624ZJJG%267C0%2622%2620IsUseOCache%263D%26220%2622%2620IsUserID%263D%26220%2622%2620SiteId%263D%26228907bd13-1d14-4f9e-8c01-e482d9590d10%2622%2620LockedColumn%263D%26220%2622%2620IsLocked%263D%26220%2622%2620ClientWidth%263D%26221601%2622%2620ShowModeCode%263D%2622default%2622%2620Language%263D%2622chinese%2622/%263E"
     try:
         response = requests.request("POST",
                                     build_url,
                                     data=payload,
                                     params=querystring)
         html = response.text
         # The first row is dropped before parsing.
         build_info_list = re.findall('<tr.*?>.*?</tr>', html,
                                      re.S | re.M)[1:]
         for i in build_info_list:
             build = Building(co_index)
             build.co_id = co_id
             # Fields are addressed by counting "spanctfield" spans from the
             # start of the row: 2nd, 3rd and 7th respectively.
             build.bu_num = re.search(
                 '<span class="spanctfield".*?<span class="spanctfield".*?>.*?<a.*?>(.*?)<',
                 i, re.S | re.M).group(1)
             build.bu_floor = re.search(
                 '<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?>(.*?)<',
                 i, re.S | re.M).group(1)
             build.bu_pre_sale = re.search(
                 '<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?>(.*?)<',
                 i, re.S | re.M).group(1)
             build.bu_id = re.search('id="Tr_(.*?)"', i,
                                     re.S | re.M).group(1)
             build.insert_db()
             self.get_house_info(co_id, build.bu_id)
     except Exception as e:
         # Report the cause as well; the handler also catches parsing
         # failures, not only request errors.
         print('请求错误,url={},data={},params={}'.format(
             build_url, payload, querystring), e)
Пример #6
0
 def get_build_info(self, build_url_list, comm):
     """Complete ``comm`` from each pre-sale page and store its buildings.

     For every relative URL the page is fetched, the licence number and
     licence date are written onto ``comm`` (which is then inserted), and
     each white-background table row becomes one building.  All buildings
     of a page share the ``PresellId`` taken from the page URL as their id.
     Any exception while handling a page is printed and the next URL is
     tried.

     :param build_url_list: relative page URLs ending in ``PresellId=<id>``.
     :param comm: community record; mutated and inserted once per page.
     """
     flags = re.S | re.M

     def first_group(pattern, text):
         # Value of the first capture group; AttributeError when no match.
         return re.search(pattern, text, flags).group(1)

     for rel_url in build_url_list:
         try:
             build_url = 'http://58.51.240.121:8503/' + rel_url
             page_text = requests.get(build_url, headers=self.headers).text
             comm.co_pre_sale = first_group('id="PresellInfo1_lblXkzh">(.*?)<',
                                            page_text)
             comm.co_pre_sale_date = first_group(
                 'id="PresellInfo1_lblFzrq">(.*?)<', page_text)
             comm.insert_db()
             rows = re.findall('<tr bgcolor="#FFFFFF">.*?</tr>', page_text,
                               flags)
             for row in rows:
                 build = Building(co_index)
                 build.co_id = comm.co_id
                 build.bu_num = first_group('<td.*?>(.*?)<', row)
                 build.bu_floor = first_group('<td.*?<td.*?>(.*?)<', row)
                 build.bu_all_house = first_group('<td.*?<td.*?<td.*?>(.*?)<',
                                                  row)
                 build.bu_id = re.search('PresellId=(.*?)$',
                                         build_url).group(1)
                 build.insert_db()
                 house_url = first_group('a href="(.*?)"', row)
                 self.get_house_info(house_url, comm.co_id, build.bu_id)
         except Exception as e:
             print('请求错误,co_index={},url={}'.format(co_index, build_url), e)
Пример #7
0
 def build_info(self, build_detail, co_id):
     """Store the buildings listed on ``build_detail`` and parse their houses.

     The page is fetched through ``Proxy_contact`` and decoded as GB18030.
     Each white-background row becomes one building; a row that cannot be
     parsed or stored is logged and skipped.  For every stored building the
     two registration numbers from its link are GBK-percent-encoded into a
     house-list URL that is passed to ``self.house_info``, followed by a
     three second pause.

     :param build_detail: URL of the building list page.
     :param co_id: community id stored on each building.
     :raises AttributeError: if a row link has no ``DengJh=...&`` part; that
         lookup is outside the per-row error handling.
     """
     proxy = Proxy_contact(app_name='wuhan',
                           method='get',
                           url=build_detail,
                           headers=self.headers)
     build_res = proxy.contact()
     html = etree.HTML(build_res.decode('gb18030'))
     info_list = html.xpath("//tr[@bgcolor='#FFFFFF']")
     for info in info_list:
         try:
             bu = Building(co_index)
             bu.co_id = co_id
             bu.bu_floor = info.xpath('./td[3]/text()')[0]
             bu.bu_all_house = info.xpath('./td[4]/text()')[0]
             bu.bu_num = info.xpath('./td//span/text()')[0]
             temp_url = info.xpath('./td/a/@href')[0]
             # Raw strings: "\d" in a plain literal is an invalid escape.
             house_reg_no = re.search(r'HouseDengjh=(.*?\d+)',
                                      temp_url).group(1)
             bu.bu_id = house_reg_no
             bu.insert_db()
         except Exception as e:
             log.error('楼栋错误{}'.format(e))
             continue
         a = parse.quote(re.search(r'DengJh=(.*?\d+)&', temp_url).group(1),
                         encoding='gbk')
         # Same value that was stored as bu_id above; no second search needed.
         b = parse.quote(house_reg_no, encoding='gbk')
         bu_url = 'http://scxx.fgj.wuhan.gov.cn/5.asp?DengJh=' + a + '&HouseDengjh=' + b
         self.house_info(bu.bu_id, bu_url, co_id)
         time.sleep(3)
         time.sleep(3)
Пример #8
0
    def get_build_info(self, comm_url_list):
        """Store one building per entry of ``comm_url_list`` plus its houses.

        Each entry must contain two ``+<digits>+`` groups: the first is used
        as ``sid``, the second as ``pid`` (which also becomes the building
        id).  The building page is parsed and inserted; a failure there is
        printed, and the house list is still fetched afterwards.  Every
        ``<Result>`` element of the house list becomes one house; a failing
        house is printed and skipped.

        An entry without both ids is reported and skipped entirely, because
        neither URL can be built for it.

        :param comm_url_list: iterable of strings carrying the two ids.
        """
        flags = re.S | re.M
        for i in comm_url_list:
            ids = re.findall(r'\+(\d+)\+', i)
            if len(ids) < 2:
                print('co_index={}, 楼栋错误,url={}'.format(co_index, i))
                continue
            sid, pid = ids[0], ids[1]
            build_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/bldg_query.aspx?pid=' + pid + '&sid=' + sid
            # Created before the try so the house section below always has a
            # building belonging to this entry, never one left over from an
            # earlier iteration.
            build = Building(co_index)
            build.bu_id = pid
            try:
                response = requests.get(build_url)
                html = response.text
                # Number and address are both filled from the same cell.
                location = re.search('楼栋座落.*?<td.*?>(.*?)<', html,
                                     flags).group(1)
                build.bu_num = location
                build.bu_address = location
                build.bu_pre_sale = re.search('预售证号.*?">(.*?)&nbsp', html,
                                              flags).group(1)
                build.bu_pre_sale_date = re.search('时间.*?">(.*?)&nbsp', html,
                                                   flags).group(1)
                build.bu_all_house = re.search('dM.*?">(.*?)&nbsp', html,
                                               flags).group(1)
                build.insert_db()
            except Exception as e:
                print('co_index={}, 楼栋错误,url={}'.format(co_index, build_url),
                      e)

            house_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/proxp.aspx?key=WWW_LPB_001&params=' + sid
            result = requests.get(house_url)
            html_ = result.text

            for house_info in re.findall('<Result.*?</Result>', html_, flags):
                try:
                    house = House(co_index)
                    house.bu_id = build.bu_id
                    house.bu_num = build.bu_num
                    house.ho_name = re.search('<ONAME>(.*?)</ONAME>',
                                              house_info, flags).group(1)
                    house.ho_num = re.search('<OSEQ>(.*?)</OSEQ>', house_info,
                                             flags).group(1)
                    house.ho_build_size = re.search('<BAREA>(.*?)</BAREA>',
                                                    house_info,
                                                    flags).group(1)
                    house.ho_floor = re.search('<FORC>(.*?)</FORC>',
                                               house_info, flags).group(1)
                    house.ho_true_size = re.search('<PAREA>(.*?)</PAREA>',
                                                   house_info,
                                                   flags).group(1)
                    house.insert_db()
                except Exception as e:
                    print('co_index={}, 房号错误'.format(co_index), e)
Пример #9
0
    def get_build_info(self, co_id, co_name):
        """Walk project -> pre-sale pages -> building pages and store them.

        The project's pre-sale list is fetched first.  Each listed pre-sale
        page (first row skipped) is opened, and each building row on it
        (first row skipped) gets the next value of the module-level counter
        ``building_id`` as its id.  The building page supplies floor,
        address and house links; the building is inserted and every house
        link is passed to ``self.get_house_info``.

        Pages that do not answer with status 200 are reported and skipped.
        Errors while handling a building or a house are printed and the
        loop continues.

        :param co_id: project id as a string (concatenated into the URL and
            stored on every building).
        :param co_name: project name as a string (concatenated into the URL).
        """
        global building_id
        site = 'http://www.czhome.com.cn/'
        listing_url = site + 'Presell.asp?projectID=' + co_id + '&projectname=' + co_name
        listing = requests.get(listing_url, headers=self.headers)
        listing_tree = etree.HTML(listing.content.decode('gbk'))
        presale_rows = listing_tree.xpath('//tr[@class="indextabletxt"]')
        for presale_row in presale_rows[1:]:
            presale_url = site + presale_row.xpath('td[2]/a/@href')[0]
            presale = requests.get(presale_url, headers=self.headers)
            # Compare by value: "is not 200" tests object identity.
            if presale.status_code != 200:
                print("co_index={},预售url:{}连接失败".format(co_index, presale_url))
                continue
            presale_tree = etree.HTML(presale.content.decode('gbk'))
            building_rows = presale_tree.xpath('/html/body/table/tr/td/table/tr/td/table/tr')[1:]
            for building_row in building_rows:
                try:
                    building = Building(7)
                    # The counter advances even if this building fails later.
                    building_id += 1
                    building.bu_id = building_id
                    # Total number of units.
                    bu_all_house = building_row.xpath('td[7]/text()')[0]
                    building_url = site + building_row.xpath('td[1]/a/@href')[0]
                    response = requests.get(building_url, headers=self.headers)
                    if response.status_code != 200:
                        print("co_index={},楼栋url:{}连接失败".format(co_index, building_url))
                        continue
                    building_html = response.content.decode('gbk')
                    building_tree = etree.HTML(building_html)
                    # Floor: last underlined text in the first column.
                    bu_floor = building_tree.xpath('//*[@id="Table4"]/tr[2]/td/table[3]/tr/td[1]/u/text()')[-1]
                    house_url_list = building_tree.xpath('//*[@id="Table4"]/tr[2]/td/table[3]/tr/td/a/@href')
                    bu_address = re.search('<center><font color=.*?&nbsp;&nbsp;(.*?)<', building_html, re.S | re.M).group(1)
                    building.bu_all_house = bu_all_house
                    building.bu_address = bu_address
                    building.bu_floor = bu_floor
                    building.co_id = co_id
                    building.insert_db()
                    for house_href in house_url_list:
                        try:
                            house = House(7)
                            house_url = site + house_href
                            self.get_house_info(house_url, house, co_id, building_id, building)
                        except Exception as e:
                            print(e)
                except Exception as e:
                    print(e)
Пример #10
0
 def get_build_info(self, build_logo_list, preid):
     """Store the buildings of each pre-sale page and parse their houses.

     For every value in ``build_logo_list`` the building list page is
     fetched.  Building names, unit counts and the quoted arguments of each
     ``showHouseStatus(...)`` call are collected as three parallel lists.
     The first quoted argument becomes the building id; the second and
     third are passed on to ``self.get_house_info``.

     A failing building is printed and skipped; a failing page is printed
     with its URL and the next page is tried.

     :param build_logo_list: pre-sale ids (strings) appended to the list URL.
     :param preid: value stored as ``co_id`` on every building.
     """
     flags = re.S | re.M
     for build_logo in build_logo_list:
         try:
             build_url = 'https://www.qdfd.com.cn/qdweb/realweb/fh/FhBuildingList.jsp?preid=' + build_logo
             response = requests.get(build_url, headers=self.headers)
             html = response.text
             bu_num_list = re.findall(
                 'javascript:showHouseStatus.*?>(.*?)</a', html, flags)
             bu_all_house_list = re.findall(
                 'javascript:showHouseStatus.*?center.*?center.*?center.*?center.*?center.*?>(.*?)<',
                 html, flags)
             # Raw string: "\(" in a plain literal is an invalid escape.
             house_code_list = re.findall(
                 r"javascript:showHouseStatus\((.*?)\)'>", html, flags)
             for idx, bu_num in enumerate(bu_num_list):
                 try:
                     build = Building(co_index)
                     # A short sibling list raises IndexError here and only
                     # this building is skipped.
                     bu_code_list = re.findall('"(.*?)"',
                                               house_code_list[idx])
                     build.bu_num = bu_num
                     build.bu_all_house = bu_all_house_list[idx]
                     build.co_id = preid
                     build.bu_id = bu_code_list[0]
                     build.insert_db()
                     co_id = bu_code_list[2]
                     house_id = bu_code_list[1]
                     self.get_house_info(build.bu_id, co_id, house_id)
                 except Exception as e:
                     print(e)
         except Exception as e:
             print('青岛楼栋问题,url:={}'.format(build_url), e)
Пример #11
0
 def bu_info(self, bu_list, co_id):
     """Store every building row after the first and parse its houses.

     Name, pre-sale value and type come from the row's cells; the id is
     the number following ``buildid=`` in the row's link.  After
     ``insert_db()`` the link is passed to ``self.ho_info``.

     :param bu_list: lxml row elements; the first one is skipped.
     :param co_id: community id stored on each building.
     :raises IndexError: if a row lacks one of the expected cells.
     :raises AttributeError: if the link carries no ``buildid=<digits>``.
     """
     for bu_ in bu_list[1:]:
         bu = Building(co_index)
         bu.co_id = co_id
         bu.bu_num = bu_.xpath("./td/a/text()")[0]
         bu.bu_pre_sale = bu_.xpath("./td[2]/text()")[0]
         bu.bu_type = bu_.xpath("./td[4]/text()")[0]
         bu_url = bu_.xpath("./td/a/@href")[0]
         # Raw string: "\d" in a plain literal is an invalid escape.
         bu.bu_id = re.search(r'buildid=(\d+)', bu_url).group(1)
         bu.insert_db()
         self.ho_info(bu_url, co_id, bu.bu_id)
Пример #12
0
    def bu_parse(self, co_id, page, co_url, co_res, path_url):
        """Post through every result page of a project and store its buildings.

        The three ASP.NET form tokens are read from ``co_res`` and re-posted
        with the pager field set to each page index ``0 .. page - 1``, so
        every page is requested exactly once.  Each table row after the
        first becomes a building; its link, resolved against ``path_url``,
        is passed to ``self.ho_parse``.

        ``self.headers['Referer']`` is set to ``co_url`` as a side effect.

        :param co_id: community id stored on each building.
        :param page: number of result pages (anything ``int()`` accepts).
        :param co_url: URL that is posted to and sent as Referer.
        :param co_res: response of the initial page; source of the tokens.
        :param path_url: URL whose ``SaleInfoProListIndex.aspx`` part is
            removed to form the base of the house links.
        """
        html = etree.HTML(co_res.text)
        # NOTE(review): the tokens always come from the initial page, never
        # from the paged responses, so they are read once here - confirm the
        # site accepts the original tokens for every page.
        viewstate = html.xpath("//input[@id='__VIEWSTATE']/@value")[0]
        generator = html.xpath("//input[@id='__VIEWSTATEGENERATOR']/@value")[0]
        valid = html.xpath("//input[@id='__EVENTVALIDATION']/@value")[0]
        self.headers['Referer'] = co_url

        for page_index in range(int(page)):
            formdata = {
                "__VIEWSTATE": viewstate,
                "__EVENTTARGET":
                'ctl00$MainContent$OraclePager1$ctl11$PageList',
                "__VIEWSTATEGENERATOR": generator,
                "__EVENTVALIDATION": valid,
                # Building the form from the index about to be requested
                # avoids posting page 0 twice and dropping the last page.
                "ctl00$MainContent$OraclePager1$ctl11$PageList": page_index
            }
            page_res = requests.post(co_url,
                                     data=formdata,
                                     headers=self.headers)
            page_html = etree.HTML(page_res.text)

            bu_list = page_html.xpath(
                "//table[@id='ctl00_MainContent_OraclePager1']//tr")

            for bu in bu_list[1:]:
                build = Building(co_index)
                build.co_id = co_id
                build.bu_num = bu.xpath("./td/a/text()")[0]
                build.bu_build_size = bu.xpath("./td[2]/text()")[0]
                build.bu_floor = bu.xpath("./td[4]/text()")[0]
                build.bu_all_house = bu.xpath("./td[3]/text()")[0]
                tmp_url = bu.xpath("./td/a/@href")[0]
                build.bu_id = re.search('PBTAB_ID=(.*?)&', tmp_url).group(1)
                build.insert_db()
                house_url = path_url.replace('SaleInfoProListIndex.aspx',
                                             '') + tmp_url
                self.ho_parse(co_id, build.bu_id, house_url)
Пример #13
0
 def get_data_obj(self, analyzer, co_index):
     """Create the record object that matches ``analyzer``.

     :param analyzer: ``'comm'``, ``'build'`` or ``'house'``.
     :param co_index: value passed to the record's constructor.
     :returns: a new ``Comm``, ``Building`` or ``House``; ``None`` for any
         other ``analyzer`` value.
     """
     if analyzer == 'comm':
         return Comm(co_index)
     if analyzer == 'build':
         return Building(co_index)
     if analyzer == 'house':
         return House(co_index)
     # Unknown kind: nothing to build.
     return None
Пример #14
0
 def build_info(self, bu_list, co_id):
     """Store one building per link element and parse its houses.

     The element's ``href`` carries the building name (``zh=``) and id
     (``n=<digits>``); the second paragraph holds the floor count and the
     third the pre-sale value.  After ``insert_db()`` the absolute link is
     passed to ``self.ho_info``.

     :param bu_list: lxml elements that expose ``@href`` themselves.
     :param co_id: community id stored on each building.
     :raises IndexError: if an element lacks the href or a paragraph.
     :raises AttributeError: if the href or floor text lacks an expected
         part.
     """
     for bo in bu_list:
         ho_url = bo.xpath("./@href")[0]
         floor = bo.xpath(".//p[2]/text()")[0]
         bu = Building(co_index)
         bu.bu_pre_sale = bo.xpath(".//p[3]/text()")[0]
         # A lazy "(.*?)" with nothing after it always captures the empty
         # string; read the parameter value up to the next "&" instead.
         bu.bu_num = re.search(r'zh=([^&]*)', ho_url).group(1)
         bu.bu_id = re.search(r'n=(\d+)', ho_url).group(1)
         bu.co_id = co_id
         bu.bu_floor = re.search(r'总层数.*?(\d+)', floor).group(1)
         bu.insert_db()
         house_url = "http://www.ggsfcw.com/" + ho_url
         self.ho_info(house_url, co_id, bu.bu_id)
Пример #15
0
 def build_info(self, bu_info_list, co_id):
     """Store one building per row element; log and skip rows that fail.

     The id is the number after ``dbh=`` in the row's ``onclick``
     attribute; name, unit count and size come from the row's cells.

     :param bu_info_list: lxml row elements carrying an ``onclick``.
     :param co_id: community id stored on each building.
     """
     for bu_info in bu_info_list:
         try:
             bu = Building(co_index)
             url = bu_info.xpath("./@onclick")[0]
             # Raw string: "\d" in a plain literal is an invalid escape.
             bu.bu_id = re.search(r'dbh=(\d+)', url).group(1)
             bu.co_id = co_id
             bu.bu_num = bu_info.xpath("./td[@class='org']/text()")[0]
             bu.bu_all_house = bu_info.xpath("./td[3]/text()")[0]
             bu.size = bu_info.xpath("./td[2]/text()")[0]
             bu.insert_db()
         except Exception as e:
             # Put the error into the message itself: an extra positional
             # argument without a placeholder is not rendered by
             # logging-style loggers.
             log.error('楼栋信息错误{}'.format(e))
Пример #16
0
 def get_comm_info(self, comm_id_list):
     """Store each project in ``comm_id_list`` with its buildings and houses.

     The project page supplies the community fields (inserted first) and a
     detail table whose rows, after the first, describe the buildings.
     For every building the house-list page behind the row's link is
     fetched and its house links are passed to ``self.get_house_info``.

     A failing building is printed and skipped; a failing project is
     printed and the next id is tried.

     :param comm_id_list: project ids (strings) appended to the project URL.
     """
     flags = re.S | re.M
     for i in comm_id_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://web.xxfdc.gov.cn/onlineQuery/projectInformation.do?xmId=' + i
             response = requests.get(comm_url, headers=self.headers)
             html = response.text
             comm.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html,
                                      flags).group(1)
             comm.co_address = re.search('项目地址:.*?<td.*?>(.*?)<', html,
                                         flags).group(1)
             comm.co_develops = re.search('开发商:.*?<td.*?>(.*?)<', html,
                                          flags).group(1)
             comm.co_all_house = re.search('已售总套数:.*?<td.*?>(.*?)<', html,
                                           flags).group(1)
             comm.co_build_size = re.search('已售总面积:.*?<td.*?>(.*?)<', html,
                                            flags).group(1)
             comm.area = re.search('行政区别:.*?<td.*?>(.*?)<', html,
                                   flags).group(1)
             comm.co_volumetric = re.search('容积率:.*?<td.*?>(.*?)<', html,
                                            flags).group(1)
             comm.co_id = i
             comm.insert_db()
             bu_html = re.search(
                 '<table class="table table-bordered itemInfoDetail.*?</table>',
                 html, flags).group()
             build_info_list = re.findall('<tr>.*?</tr>', bu_html, flags)[1:]
             for row in build_info_list:
                 # Bound before the try so the error report can always name
                 # it, even when the failure happens before the link is read.
                 house_url = None
                 try:
                     build = Building(co_index)
                     build.bu_num = re.search('<td>(.*?)<', row,
                                              flags).group(1)
                     build.bu_all_house = re.search(
                         '<td>.*?<td>.*?<td>(.*?)<', row, flags).group(1)
                     build.bu_id = re.search('buildId=(.*?)&', row,
                                             flags).group(1)
                     build.co_id = comm.co_id
                     build.insert_db()
                     # Prefer this row's own link; searching only the whole
                     # table would hand every building the first row's link.
                     # The table-wide search stays as a fallback.
                     link = (re.search('<a href="(.*?)"', row, flags)
                             or re.search('<a href="(.*?)"', bu_html, flags))
                     house_url = link.group(1)
                     house_page = requests.get(house_url,
                                               headers=self.headers)
                     house_url_list = re.findall(
                         '<td width="110">.*?<a.*?href="(.*?)"',
                         house_page.text, flags)
                     self.get_house_info(house_url_list, build.bu_id,
                                         comm.co_id)
                 except Exception as e:
                     print(
                         '楼栋错误,co_index={},url={}'.format(
                             co_index, house_url), e)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
Пример #17
0
 def get_comm_detail(self, detail_url, area):
     """Store the community behind ``detail_url`` and all of its buildings.

     Community fields are looked up by element id on the detail page and
     inserted.  Each row of the ``donglist`` table then yields one building
     (address, number and floor from the 2nd to 4th cells, id from the
     ``LID=`` part of the row link), which is inserted before
     ``self.get_house_info`` is called for it.

     Any exception is printed with the page URL and ends the method.

     :param detail_url: relative page URL containing ``FD=<id>&``.
     :param area: value stored as the community's ``area``.
     """
     flags = re.S | re.M

     def grab(pattern, text):
         # First capture group of the first match.
         return re.search(pattern, text, flags).group(1)

     try:
         comm = Comm(co_index)
         comm_detail_url = 'http://www.yfci.gov.cn:8080/HousePresell/' + detail_url
         page = requests.get(comm_detail_url, headers=self.headers)
         html = page.text
         comm.co_develops = grab('id="kfsmc".*?<a.*?>(.*?)<', html)
         comm.co_name = grab('id="PresellName".*?<a.*?>(.*?)<', html)
         comm.co_address = grab('id="HouseRepose".*?>(.*?)<', html)
         comm.co_build_size = grab('id="PresellArea".*?>(.*?)<', html)
         comm.co_all_house = grab('id="djrqtd".*?>(.*?)<', html)
         comm.co_land_use = grab('id="landinfo".*?>(.*?)<', html)
         comm.co_type = grab('id="zczjtd".*?>(.*?)<', html)
         comm.co_pre_sale = grab('id="bookid".*?<a.*?>(.*?)<', html)
         comm.co_pre_sale_date = grab('id="FZDatebegin".*?>(.*?)<', html)
         comm.co_open_time = grab('id="kpdate".*?>(.*?)<', html)
         comm.co_id = grab('FD=(.*?)&', detail_url)
         comm.area = area
         comm.insert_db()
         table_html = re.search('id="donglist".*?</table>', html,
                                flags).group()
         for row in re.findall('<tr.*?</tr>', table_html, flags):
             build = Building(co_index)
             build.co_id = comm.co_id
             build.bu_address = grab('<td.*?<td.*?>(.*?)<', row)
             build.bu_num = grab('<td.*?<td.*?<td.*?>(.*?)<', row)
             build.bu_floor = grab('<td.*?<td.*?<td.*?<td.*?>(.*?)<', row)
             house_url = grab('href="(.*?)"', row)
             build.bu_id = grab("LID=(.*?)$", house_url)
             build.insert_db()
             self.get_house_info(house_url, comm.co_id, build.bu_id)
     except Exception as e:
         print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url),
               e)
Пример #18
0
 def get_comm_info(self, comm_info_list):
     """Store one building per HTML fragment and parse its houses.

     From each fragment the link, the link text (building number), the
     second cell (address), the third cell (pre-sale value) and the
     ``slbh=`` value (building id) are extracted.  The building is
     inserted and the link is passed to ``self.get_house_info``.

     :param comm_info_list: iterable of HTML strings, one per building.
     :raises AttributeError: if a fragment lacks one of the expected parts.
     """
     flags = re.S | re.M

     def pick(pattern, fragment):
         # First capture group of the first match.
         return re.search(pattern, fragment, flags).group(1)

     for fragment in comm_info_list:
         record = Building(co_index)
         link = pick('href="(.*?)"', fragment)
         record.bu_num = pick('<a.*?>(.*?)<', fragment)
         record.bu_address = pick('<td.*?<td.*?>(.*?)<', fragment)
         record.bu_pre_sale = pick('<td.*?<td.*?<td.*?>(.*?)<', fragment)
         record.bu_id = pick('slbh=(.*?)&', fragment)
         record.insert_db()
         self.get_house_info(link, record.bu_id)
Пример #19
0
    def co_parse(self, url_list):
        """Store each community in ``url_list`` together with its buildings.

        The community's info page supplies its fields; the matching price
        page (same URL with ``info`` replaced by ``price``) lists the
        buildings.  Every building link after the first is stored and
        passed to ``self.house_parse`` together with the two numeric ids
        taken from the community URL.

        A community whose pages cannot be fetched or parsed is reported and
        skipped.

        :param url_list: lxml link elements exposing ``@href``.
        """
        for url in url_list:
            try:
                co_url = url.xpath("./@href")[0]
                new_url = "http://tmsf.qzfdcgl.com" + co_url
                co_res = requests.get(new_url, headers=self.headers)
                con = co_res.text
                co = Comm(co_index)
                co.co_id = re.search('property_(.*?)_info', co_url).group(1)
                co.co_name = re.search('楼盘名称:</span>(.*)', con).group(1)
                co.co_develops = re.search('项目公司:</span>(.*)', con).group(1)
                co.co_address = re.search('物业地址:</span>(.*?)</p', con,
                                          re.S | re.M).group(1)
                co.area = re.search('所属城区:</span>(.*)', con).group(1)
                co.insert_db()
                # Raw strings: "\d" in a plain literal is an invalid escape.
                sid = re.search(r'property_(\d+)_', co_url).group(1)
                propertyid = re.search(r'(\d+)_info', co_url).group(1)
                bu_url = new_url.replace('info', 'price')
                res = requests.get(bu_url, headers=self.headers)
                bu_html = etree.HTML(res.text)
                bu_idlist = bu_html.xpath("//dd[@id='building_dd']/a")
            except Exception as e:
                # "except Exception" lets KeyboardInterrupt/SystemExit pass,
                # and the skip is no longer silent.
                print('小区错误,co_index={}'.format(co_index), e)
                continue
            for bu_ in bu_idlist[1:]:
                element_id = bu_.xpath("./@id")[0]
                bu_id = re.search(r'.*?(\d+)', element_id).group(1)
                bu = Building(co_index)
                bu.bu_id = bu_id
                bu.co_id = co.co_id
                bu.bu_num = bu_.xpath("./text()")[0]

                bu.insert_db()
                self.house_parse(bu_id, co.co_id, sid, propertyid)
Пример #20
0
 def bu_parse(self, detail_url, co_id):
     """Store the buildings of every pre-sale licence of a project.

     The pre-sale page (``detail_url`` with ``lp`` replaced by
     ``presell``) lists the licences.  Each licence page and each building
     page behind it is fetched through a randomly chosen proxy, retrying
     without limit until a request succeeds.  The building is stored with
     the licence text as its pre-sale value and the fetched response is
     passed to ``self.ho_parse``.

     :param detail_url: project URL containing ``lp``.
     :param co_id: community id stored on each building.
     :raises AttributeError: if a building page lacks an expected label.
     """

     def fetch(url):
         # Retry with a fresh random proxy until one answers.
         # "except Exception" (not a bare except) keeps Ctrl-C working.
         # NOTE(review): there is no retry limit, so a permanently failing
         # URL loops forever, as before.
         while True:
             try:
                 proxy = self.proxies[random.randint(0, 9)]
                 return requests.get(url,
                                     headers=self.headers,
                                     proxies=proxy,
                                     timeout=10)
             except Exception:
                 continue

     pre_url = detail_url.replace('lp', 'presell')
     pre_res = requests.get(pre_url, headers=self.headers)
     pre_html = etree.HTML(pre_res.text)
     bu_pre_list = pre_html.xpath("//dt/strong/a")
     for bu_pre in bu_pre_list:
         bu_pre_url = bu_pre.xpath("./@href")[0]
         bu_pre_sale = bu_pre.xpath("./text()")[0]
         bu_url = 'http://www.zstmsf.com' + bu_pre_url
         bu_res = fetch(bu_url)
         bu_html = etree.HTML(bu_res.text)
         bu_list = bu_html.xpath("//tr//strong/a/@href")
         for bo_url in bu_list:
             ho_url = "http://www.zstmsf.com" + bo_url
             ho_res = fetch(ho_url)
             build = Building(co_index)
             build.co_id = co_id
             # Raw string: "\d" in a plain literal is an invalid escape.
             build.bu_id = re.search(r'zid=.*?(\d+)', ho_url).group(1)
             build.bu_num = re.search('幢名称:<strong>(.*?)<',
                                      ho_res.text).group(1)
             build.bu_all_house = re.search("幢总套数.*?'>(.*?)</",
                                            ho_res.text).group(1)
             build.bu_all_size = re.findall("面积.*?'>(.*?)</",
                                            ho_res.text)[0]
             build.bu_pre_sale = bu_pre_sale
             build.insert_db()
             self.ho_parse(co_id, build.bu_id, ho_res)
Пример #21
0
 def build_info(self,build_detail,co_id):
     """Store every building linked from a project page, plus its houses.

     Fetches ``build_detail`` (relative to http://as.gzfcxx.cn), follows
     each building link found there, inserts a ``Building`` for it and then
     a ``House`` for every node returned by the floor-view query.  A failure
     on a building or on a house is logged and that item is skipped.

     :param build_detail: path appended to the site root.
     :param co_id: community id copied onto buildings and houses.
     """
     host = 'http://as.gzfcxx.cn'
     listing = requests.get(host + build_detail, headers=self.headers)
     links = etree.HTML(listing.text).xpath("//div[@class='box']//font/a/@href")
     for link in links:
         try:
             page = requests.get(host + link, headers=self.headers)
             page_tree = etree.HTML(page.text)
             bu = Building(co_index)
             bu.co_id = co_id
             bu.bu_id = re.search(r'dongID=(\d+)', link).group(1)
             bu.bu_num = page_tree.xpath("//option[@selected='selected']/text()")[0]
             bu.insert_db()
             # Reuse the link's query string (up to dongID) for the floor view.
             query = re.search(r"\?(.*?dongID=\d+)", link).group(1)
             floor_url = 'http://as.gzfcxx.cn/Controls/HouseControls/FloorView.aspx?' + query
             floor_page = requests.get(floor_url, headers=self.headers)
             cells = etree.HTML(floor_page.text).xpath("//table[@class='C1 T0 F0']/..")
         except Exception as e:
             log.error('楼栋信息错误',e)
             continue
         for cell in cells:
             try:
                 ho = House(co_index)
                 title = cell.xpath("./@title")[0]
                 # NOTE(review): group(1) keeps only the first digit run of
                 # the title -- confirm the fractional part is not wanted.
                 ho.ho_build_size = re.search(r'(\d+).(\d+)', title, re.S|re.M).group(1)
                 ho.ho_name = cell.xpath(".//span/text()")[0]
                 ho.bu_id = bu.bu_id
                 ho.co_id = co_id
                 ho.insert_db()
             except Exception as e:
                 log.error('房间信息错误',e)
Пример #22
0
 def get_comm_info(self, comm_url_list):
     """Walk community links, storing each community's buildings.

     For every relative link the ``prename`` query value is percent-encoded
     as GBK and the page is fetched.  ``self.get_comm_detail`` is called with
     the first ``linkone`` href, then every ``<option>`` inside the building
     selector is inserted as a ``Building`` and passed to
     ``self.get_build_info``.  Any failure for a link is printed and the
     next link is processed.

     :param comm_url_list: iterable of paths relative to www.kmhouse.org.
     """
     flags = re.S | re.M
     for rel_link in comm_url_list:
         comm_url = "http://www.kmhouse.org" + rel_link
         try:
             co_id = re.search("PreId=(.*?)&", rel_link).group(1)
             raw_name = re.search('prename=(.*?)$', comm_url, flags).group(1)
             quoted_name = parse.quote(raw_name, encoding='gbk')
             comm_url = comm_url.replace(raw_name, quoted_name)
             html = requests.get(comm_url,
                                 headers=self.headers).content.decode('gbk')
             detail_links = re.findall('linkone" href="(.*?)"', html, flags)
             self.get_comm_detail(detail_links[0])
             selector = re.search('请选择幢号.*?</select>', html, flags).group()
             for option in re.findall("<option.*?</option>", selector, flags):
                 build = Building(co_index)
                 build.bu_id = re.search("value='(.*?)'", option,
                                         flags).group(1)
                 build.bu_num = re.search("<option.*?>(.*?)<", option,
                                          flags).group(1)
                 build.co_id = co_id
                 build.insert_db()
                 self.get_build_info(build.bu_id, co_id)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
Пример #23
0
    def get_build_info(self, build_url_list, bu_pre_sale_list, co_name, co_id):
        """Insert each building and the houses listed on its page.

        ``build_url_list`` and ``bu_pre_sale_list`` are read in parallel by
        index.  Every building page is fetched (GBK-decoded), the building
        is inserted, and each house link on the page is fetched and
        inserted as a ``House``.  Errors are printed per building and per
        house; processing then moves on to the next item.

        :param build_url_list: building paths relative to the site root.
        :param bu_pre_sale_list: presale value for the building at the
            same index.
        :param co_name: community name copied onto buildings and houses.
        :param co_id: community id copied onto buildings and houses.
        """
        base = 'http://221.2.144.162:8090/'
        flags = re.S | re.M
        for idx, rel_url in enumerate(build_url_list):
            try:
                build = Building(co_index)
                build.co_id = co_id
                build.co_name = co_name
                build.bu_pre_sale = bu_pre_sale_list[idx]
                build.bu_id = re.search(r'lh=(\d+)', rel_url).group(1)
                page = requests.get(base + rel_url, headers=self.headers)
                html = page.content.decode('gbk')
                build.bu_num = re.findall('<font color=white.*?><b>(.*?)<',
                                          html, flags)[0]
                build.bu_address = re.findall('坐落位置:</b>(.*?)<', html,
                                              flags)[0]
                build.insert_db()
                ho_url_list = re.findall('background-.*?href=(.*?) ', html,
                                         flags)
                ho_name_list = re.findall('background-color.*?<a.*?>(.*?)<',
                                          html, flags)
                for pos, ho_rel in enumerate(ho_url_list):
                    try:
                        house = House(co_index)
                        ho_page = requests.get(base + ho_rel,
                                               headers=self.headers)
                        result = ho_page.content.decode('gbk')
                        house.bu_id = build.bu_id
                        house.co_id = co_id
                        house.ho_type = re.findall(
                            '用&nbsp;&nbsp;&nbsp;途:.*?<td.*?>(.*?)<', result,
                            flags)[0]
                        house.ho_build_size = re.findall(
                            '建筑面积:.*?<td>(.*?)<', result, flags)[0]
                        house.bu_num = build.bu_num
                        house.co_name = co_name
                        # Names and links come from two separate findall
                        # passes and are paired by position.
                        house.ho_name = ho_name_list[pos]
                        house.insert_db()
                    except Exception as e:
                        print("co_index={},房屋信息错误".format(co_index), e)
            except Exception as e:
                print("co_index={},楼栋信息错误".format(co_index), e)
Пример #24
0
 def get_build_info(self, build_info_list, co_id):
     """Insert one ``Building`` per HTML row and crawl its houses.

     The first three ``<td>`` cells of a row supply the building number,
     house count and total size; the ``?id=`` value supplies the building
     id and the first ``href`` the house page passed to
     ``self.get_house_info``.  A row that fails is printed and skipped.

     :param build_info_list: iterable of HTML row strings.
     :param co_id: community id copied onto every building.
     """
     flags = re.S | re.M

     def grab(pattern, text):
         # First capture group of the first match; raises if nothing matches.
         return re.search(pattern, text, flags).group(1)

     for row in build_info_list:
         try:
             build = Building(co_index)
             build.bu_num = grab('<td>(.*?)</td>', row)
             build.bu_all_house = grab('<td>.*?<td>(.*?)</td>', row)
             build.bu_all_size = grab('<td>.*?<td>.*?<td>(.*?)</td>', row)
             build.bu_id = grab(r'\?id=(.*?)"', row)
             build.co_id = co_id
             build.insert_db()
             house_url = grab('href="(.*?)"', row)
             self.get_house_info(house_url, co_id, build.bu_id)
         except Exception as e:
             print('楼栋错误,co_index={},str={}'.format(co_index, row), e)
Пример #25
0
 def build_parse(self, co_id):  # parse building information
     """Fetch the building list of a community and store every building.

     Posts ``co_id`` as ``pid`` to the ProNBList endpoint (only page 1 with
     a page size of 50 is requested), inserts one ``Building`` per
     ``<tr objid=...>`` row and calls ``self.house_parse`` for it.

     :param co_id: community id, sent as ``pid`` and copied to each building.
     :raises AttributeError, IndexError: when a row lacks an expected field
         (rows are not individually guarded).
     """
     build_info_url = "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/ProNBList.do"
     formdata = {"pid": co_id, "pageNo": "1", "pageSize": "50"}
     res = requests.post(build_info_url,
                         data=formdata,
                         headers=self.headers)
     for row in re.findall('<tr objid.*?</tr>', res.text, re.S | re.M):
         # A fresh record per row: reusing one instance for every insert
         # lets state from an earlier row leak into later ones.
         bu = Building(co_index)
         bu.co_id = co_id
         bu.bu_id = re.search(r'objid="(\d+)"', row).group(1)
         # The second <span> of the row is used as the building number.
         bu.bu_num = re.findall('<span>(.*?)<', row)[1]
         bu.bu_floor = re.search(r'<td>(\d+)\(', row).group(1)
         # The last <td> of the row is used as the address.
         bu.bu_address = re.findall('<td>(.*?)</td>', row)[-1]
         bu.insert_db()
         self.house_parse(bu.bu_id, co_id)
Пример #26
0
 def get_build_info(self, build_url_list, co_id):
     """Insert one ``Building`` per ``id,,name`` entry and crawl its houses.

     :param build_url_list: sequence whose first element is a string of
         ``;;``-separated entries, each of the form ``<bu_id>,,<bu_num>``.
         An empty sequence is treated as "no buildings".
     :param co_id: community id copied onto every building.
     :raises IndexError: when a non-empty entry has no ``,,`` separator.
     """
     if not build_url_list:
         # Nothing was scraped for this community, so there is nothing to do.
         return
     for entry in build_url_list[0].split(';;'):
         if not entry:
             # A leading, trailing or doubled ';;' yields an empty fragment
             # with no id/name pair; skip it rather than fail on code[1].
             continue
         code = entry.split(',,')
         build = Building(co_index)
         build.co_id = co_id
         build.bu_id = code[0]
         build.bu_num = code[1]
         build.insert_db()
         self.get_house_info(build.bu_id)
Пример #27
0
 def build_parse(self, co_id):
     """Fetch a community's building table and store every row.

     Posts ``co_id`` as ``itemRecord`` to GetBTable.ashx and inserts a
     ``Building`` for each ``<tr id ...>`` row.  The presale fields are
     optional: if any of them cannot be parsed the problem is logged and
     the building is still inserted with whatever was set before the
     failure.  ``self.house_parse`` is then called for the building.

     :param co_id: community id sent to the endpoint and copied to rows.
     :raises AttributeError: when a row has no ``GetData`` id to extract.
     """
     response = requests.post("http://www.zyfgj.org/spf/GetBTable.ashx",
                              data={"itemRecord": co_id, "houseCode": 0},
                              headers=self.headers)
     rows = re.findall('<tr id.*?</tr>', response.content.decode())
     for row in rows:
         bu = Building(co_index)
         bu.co_id = co_id
         # The id is the argument after the first comma, without its quotes.
         raw_id = re.search(r'GetData.*?,(.*?)\)', row).group(1)
         bu.bu_id = raw_id.strip("'")
         try:
             bu.bu_num = re.search('预售证时间:.*?<td>(.*?)</td', row).group(1)
             bu.bu_pre_sale = re.search('预售证号:(.*?)</td', row).group(1)
             bu.bu_pre_sale_date = re.search('预售证时间:(.*?)</td', row).group(1)
             bu.bu_all_house = re.search(r'预售证号:.*?<td>(\d+)</td',
                                         row).group(1)
         except Exception as e:
             log.error("{}楼栋无预售号等信息{}".format(row, e))
         bu.insert_db()
         self.house_parse(co_id, bu.bu_id)
Пример #28
0
 def get_build_info(self, build_info_list, bu_num_list, co_id, comm_url):
     """Insert one ``Building`` for each entry of ``bu_num_list``.

     The building at position *n* takes its number from ``bu_num_list[n]``
     and its ``info`` from ``build_info_list[n]``.  A failure for one
     position is printed, with ``comm_url`` for context, and the loop
     carries on with the next position.

     :param build_info_list: per-building info, indexed like ``bu_num_list``.
     :param bu_num_list: building numbers; drives the iteration.
     :param co_id: community id copied onto every building.
     :param comm_url: only used in the error message.
     """
     for idx, bu_num in enumerate(bu_num_list):
         try:
             build = Building(co_index)
             build.co_id = co_id
             build.info = build_info_list[idx]
             build.bu_num = bu_num
             build.insert_db()
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, comm_url), e)
Пример #29
0
    def analyzer_comm_url(self, comm_url_list):
        """Store each community and its buildings; return house-list URLs.

        Every URL is fetched and GBK-decoded, a ``Comm`` is filled from the
        labelled ``<span>`` values and inserted, then each ``onmouseover``
        table row is inserted as a ``Building``.  All ``href`` values found
        in those rows are turned into absolute URLs and collected.  Errors
        are printed per building row and per community.

        :param comm_url_list: iterable of community page URLs.
        :return: list of absolute house-list URLs gathered from all
            communities that were processed without a community-level error.
        """
        flags = re.S | re.M
        # Every community field sits in the <span> following its label.
        value_after_label = '.*?">.*?<span.*?>(.*?)</span>'
        comm_fields = (
            ('co_name', '项目名称:'),  # project name
            ('co_address', '项目地址:'),  # project address
            ('co_develops', '开发商:'),  # developer
            ('co_build_size', '总建筑面积:'),  # total floor area
            ('co_land_type', '用地依据:'),  # land-use certificate
            ('co_all_house', '>总套数:'),  # total number of units
            ('area', '所在区域:'),  # district
            ('co_work_pro', '施工许可证:'),  # construction permit
            ('co_plan_pro', '建设工程规划许可证:'),  # planning permit
        )
        link_prefix = 'http://www.stfcj.gov.cn/stsite/ProjectList/'

        all_url = []
        for comm_url in comm_url_list:
            try:
                html = requests.get(comm_url).content.decode('gbk')
                c = Comm(self.co_index)
                for attr, label in comm_fields:
                    setattr(c, attr,
                            re.search(label + value_after_label, html,
                                      flags).group(1))
                c.insert_db()

                # Collected separately so that a community-level failure
                # discards the partial list, exactly as before.
                url_list = []
                for row in re.findall('onmouseover.*?</TR>', html, flags):
                    try:
                        b = Building(self.co_index)
                        cells = re.findall('<TD.*?>(.*?)</TD>', row, flags)
                        b.co_name = cells[1]
                        b.bu_num = cells[2]
                        b.bu_type = cells[4]
                        b.insert_db()
                        url_list.extend(
                            link_prefix + href
                            for href in re.findall('href="(.*?)"', row, flags))
                    except Exception as e:
                        print('楼栋错误,co_index={},url={}'.format(co_index, comm_url), e)
                all_url.extend(url_list)
            except Exception as e:
                print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
        return all_url
Пример #30
0
 def get_comm_info(self, comm_url, comm):
     """Complete and store a community, then store its buildings.

     Reads the developer name from the community page, inserts ``comm``,
     then walks the presale page: each ``SetSelect(...)`` call yields a
     presale id and a start id, which select a building listing.  Every
     listing row is inserted as a ``Building`` and its first ``href`` is
     passed to ``self.get_house_info``.  A failing row is printed and
     skipped.

     :param comm_url: community path relative to www.fangdi.com.cn.
     :param comm: community record; ``co_develops`` is set on it and
         ``co_id`` is read from it (concatenated into a URL, so a string).
     """
     flags = re.S | re.M
     site = 'http://www.fangdi.com.cn/'

     comm_page = requests.get(site + comm_url, headers=self.headers)
     html = comm_page.content.decode('gbk')
     comm.co_develops = re.search('企业名称:.*?<a.*?>(.*?)<', html,
                                  flags).group(1)
     comm.insert_db()

     presell_url = 'http://www.fangdi.com.cn/Presell.asp?projectID=' + comm.co_id
     presell_page = requests.get(presell_url, headers=self.headers)
     presell_html = presell_page.content.decode('gbk')
     id_pairs = re.findall(
         r"javascript:SetSelect\(.*?,.*?,.*?,.*?,.*?,'(.*?)','(.*?)'\)",
         presell_html, flags)
     for presell_id, start_id in id_pairs:
         # NOTE(review): ProjectID is the same fixed literal for every
         # community -- confirm the endpoint ignores it.
         build_detail_url = (
             'http://www.fangdi.com.cn/building.asp?ProjectID=OTU4OHwyMDE4LTQtNHwxNw&PreSell_ID='
             + presell_id + '&Start_ID=' + start_id)
         listing = requests.get(build_detail_url,
                                headers=self.headers).content.decode('gbk')
         for row in re.findall('class="indextabletxt">.*?</tr>', listing,
                               flags):
             try:
                 build = Building(co_index)
                 build.bu_num = re.search('<a.*?>(.*?)</a>', row,
                                          flags).group(1)
                 build.bu_all_house = re.search(
                     '<a.*?<td.*?<td.*?<td.*?>(.*?)<', row, flags).group(1)
                 build.bu_build_size = re.search(
                     '<a.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row,
                     flags).group(1)
                 build.bu_id = re.search('Param=(.*?)=', row, flags).group(1)
                 build.co_id = comm.co_id
                 build.insert_db()
                 house_url = re.search('href="(.*?)"', row, flags).group(1)
                 self.get_house_info(house_url, build.bu_id, build.co_id)
             except Exception as e:
                 print(
                     '楼栋错误,co_index={},url={}'.format(
                         co_index, build_detail_url), e)