Exemplo n.º 1
0
 def comm_detail(self, comm_url_list, city):
     """Fetch the sold-deal records of each community and store them.

     For every path in ``comm_url_list`` except the first, the community
     page is requested for its name, city and region, and the
     ``GetDealRecord`` JSON endpoint is requested for its deals. One
     ``Base`` record per deal is filled in and written with
     ``insert_db()``.

     Args:
         comm_url_list: community URL paths containing ``xq-<code>``.
         city: URL whose ``/xiaoqu/`` part is replaced by each path.

     Request, decoding and parsing failures are logged and the affected
     community or deal is skipped, so one malformed entry does not abort
     the remaining ones.
     """
     for comm_url in comm_url_list[1:]:
         com_url = city.replace('/xiaoqu/', comm_url)
         code_match = re.search(r'xq-(.*)', comm_url)
         if code_match is None:
             log.error('source={}, 小区编码解析失败 url={}'.format(
                 '中原地产', com_url))
             continue
         # The endpoint has two post types: S = for sale, R = for rent.
         # S is used here.
         comm_detail_url = (
             'http://sh.centanet.com/apipost/GetDealRecord?estateCode=' +
             code_match.group(1) + '&posttype=S&pageindex=1&pagesize=10000')
         try:
             # A timeout keeps a stalled connection from hanging the crawl.
             com_res = requests.get(url=com_url,
                                    headers=self.headers,
                                    proxies=self.proxies,
                                    timeout=60)
         except Exception as e:
             log.error('source={}, 请求失败 url={} e={}'.format(
                 '中原地产', com_url, e))
             continue
         try:
             res = requests.get(url=comm_detail_url,
                                headers=self.headers,
                                proxies=self.proxies,
                                timeout=60)
         except Exception as e:
             log.error('source={}, 请求失败 url={} e={}'.format(
                 '中原地产', comm_detail_url, e))
             continue
         try:
             deals = json.loads(res.text)["result"]
         except Exception as e:
             log.error('source={}, 序列化失败 e={}'.format('中原地产', e))
             continue
         try:
             html = etree.HTML(com_res.text)
             district_name = html.xpath("//div/h3/text()")[0]
             city_name = html.xpath(
                 "//div[@class='idx-city']/text()")[0].replace(
                     '\n', '').replace('\t', '').replace(' ', '')
             region = html.xpath("//a[@class='f000']/text()")[0].replace(
                 '\n', '').replace('\t', '').replace(' ', '')
         except Exception as e:
             log.error('source={}, 区域解析失败 e={}'.format('中原地产', e))
             continue
         # ``or []`` tolerates a null "result" in the response.
         for data in deals or []:
             co = Base(source)
             try:
                 self._fill_deal_record(co, data, district_name, region,
                                        city_name, comm_detail_url)
             except Exception as e:
                 log.error('source={}, 成交记录解析失败 url={} e={}'.format(
                     '中原地产', comm_detail_url, e))
                 continue
             co.insert_db()

 def _fill_deal_record(self, co, data, district_name, region, city_name,
                       comm_detail_url):
     """Fill ``co`` from one deal entry of the JSON result.

     Room count, hall count and prices fall back to ``None`` when they
     cannot be parsed. A malformed area, direction or deal date raises so
     that the caller can skip the entry.
     """
     # Community name
     co.district_name = district_name.strip()
     # Region
     co.region = region
     # City
     co.city = city_name
     # Number of rooms; \d+ so that a count of 10 or more is read whole
     try:
         co.room = int(re.search(r'(\d+)室', data["houseType"]).group(1))
     except Exception as e:
         co.room = None
         log.error('source={}, room为空 e={}'.format('中原地产', e))
     # Number of halls
     try:
         co.hall = int(re.search(r'(\d+)厅', data["houseType"]).group(1))
     except Exception as e:
         co.hall = None
         log.error('source={}, hall e={}'.format('中原地产', e))
     # Area; left untouched when the API returns an empty value
     area = data['areaSize'].replace('平', '')
     if area:
         co.area = round(float(area), 2)
     # Orientation
     co.direction = data['direction']
     # Deal date; '20' is prepended before parsing with a four-digit-year
     # format, so dealTime presumably carries a two-digit year
     t = time.strptime('20' + data['dealTime'], "%Y-%m-%d")
     co.trade_date = co.local2utc(
         datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
     # Average price; the fractional part is optional so that a
     # single-digit price such as "5" is accepted as well
     try:
         unit_price = re.search(r'(\d+(?:\.\d+)?)',
                                data['unitPrice']).group(1)
         # round() instead of truncation: the float product can land just
         # below the intended integer
         co.avg_price = int(round(float(unit_price) * 10000))
     except Exception:
         co.avg_price = None
     # Total price is derived from average price * area rather than read
     # from the API's dealPrice field
     try:
         co.total_price = int(int(co.avg_price) * float(co.area))
     except Exception:
         co.total_price = None
     co.url = comm_detail_url
Exemplo n.º 2
0
    def parse(self, room_url, co_name, region, city_name):
        """Crawl every result page of a community and store its listings.

        The first response is searched for the page count ("共N页"); each
        page is then requested and every ``right-information`` block on it
        is turned into a ``Base`` record and written with ``insert_db()``.

        Args:
            room_url: URL of the community's listing page.
            co_name: community name stored on every record.
            region: region stored on every record.
            city_name: city stored on every record.

        Request failures are logged. A page that still fails after a
        bounded number of attempts is skipped, and a listing whose
        required fields cannot be parsed is logged and skipped.
        """
        try:
            page_index = requests.get(url=room_url,
                                      headers=self.headers,
                                      proxies=self.proxies,
                                      timeout=60)
        except Exception as e:
            log.error('请求错误, source="{}",url="{}",e="{}"'.format(
                '新浪乐居', room_url, e))
            return
        page_match = re.search(r'共(\d+)页', page_index.text)
        if page_match is None:
            log.info('source={}, url={}, 小区无相关数据'.format('新浪乐居', room_url))
            return
        # Bounded retry: an unbounded loop never terminates when a page
        # keeps failing.
        max_attempts = 5
        for i in range(1, int(page_match.group(1)) + 1):
            url = re.sub('#.*', 'n', room_url) + str(i)
            res = None
            for _ in range(max_attempts):
                try:
                    res = requests.get(url=url,
                                       headers=self.headers,
                                       proxies=self.proxies,
                                       timeout=60)
                    break
                except Exception as e:
                    log.error('请求错误, source="{}",url="{}",e="{}"'.format(
                        '新浪乐居', url, e))
            if res is None:
                continue
            room_html = etree.HTML(res.text)
            room_list = room_html.xpath("//div[@class='right-information']")
            for item in room_list:
                room = Base(source)
                try:
                    self._fill_listing(room, item, url, co_name, region,
                                       city_name)
                except Exception as e:
                    log.error('解析错误, source="{}",url="{}",e="{}"'.format(
                        '新浪乐居', url, e))
                    continue
                room.insert_db()

    def _fill_listing(self, room, item, url, co_name, region, city_name):
        """Fill ``room`` from one ``right-information`` element.

        Room/hall counts, total price, fitment, orientation, floor and
        height fall back to ``None`` when unparsable. A missing layout,
        area, average price, floor text or date raises so that the caller
        can skip the listing.
        """
        room.url = url
        # Community name
        room.district_name = co_name
        # City
        room.city = city_name
        # Region
        room.region = region
        room_type = item.xpath("./h3/span[2]/text()")[0]
        # Rooms; \d+ so that a count of 10 or more is read whole
        try:
            room.room = int(re.search(r'(\d+)室', room_type).group(1))
        except Exception:
            room.room = None
        # Halls
        try:
            room.hall = int(re.search(r'(\d+)厅', room_type).group(1))
        except Exception:
            room.hall = None
        # Area; left untouched when the page shows an empty value
        area = item.xpath("./h3/span[3]/text()")[0].replace('平米', '')
        if area:
            room.area = round(float(area), 2)
        # Average price
        avg_price = item.xpath(".//div[@class='size  fs14']/text()")[0]
        room.avg_price = int(re.search(r'(\d+)', avg_price).group(1))
        # Total price is derived from average price * area
        try:
            room.total_price = int(int(room.avg_price) * float(room.area))
        except Exception:
            room.total_price = None
        # Orientation and fitment come from one '|'-separated text
        try:
            parts = item.xpath(
                ".//div[@class='t1 fs14']")[0].xpath('string(.)').split('|')
            if len(parts) == 2:
                room.fitment = parts[1]
                room.direction = parts[0]
            elif len(parts) == 3:
                room.fitment = parts[2]
                room.direction = parts[1]
        except Exception:
            room.fitment = None
            room.direction = None
        floor_info = item.xpath(".//div[@class='fs14']/text()[1]")[0]
        # Floor: first number in the text before '/'
        try:
            floor = re.search(r'(.*?)/', floor_info).group(1)
            room.floor = int(re.search(r'\d+', floor).group(0))
        except Exception:
            room.floor = None
        # Height: number before '层' in the text after '/'
        try:
            room.height = int(re.search(r'.*?/(\d+)层', floor_info).group(1))
        except Exception:
            room.height = None
        # Deal date
        trade_date = item.xpath(".//div[@class='date']/text()")[0]
        if trade_date:
            t = time.strptime(trade_date, "%Y-%m-%d")
            room.trade_date = room.local2utc(
                datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
Exemplo n.º 3
0
    def parse(self, res, com_url):
        """Parse a community page and store one record per deal row.

        The community name, city and region are read once from the page;
        every ``tablerecond-item`` row then yields its own ``Base`` record,
        written with ``insert_db()``. Any row field that cannot be parsed
        is stored as ``None``.

        Args:
            res: response object whose ``text`` is the page HTML.
            com_url: page URL, stored on every record.

        Raises:
            IndexError: if the community name, city or region node is
                missing from the page.
        """
        tree = etree.HTML(res.text)
        # Community name
        district_name = tree.xpath(
            "//dl[@class='fl roominfor']/dd/h2/text()")[0].replace(' ', '')
        # City
        city = tree.xpath("/html/body/div[3]/div/a[1]/text()")[0].replace(
            '中原地产', '')
        # Region
        region = tree.xpath("/html/body/div[3]/div/a[3]/text()")[0].replace(
            '小区', '')
        info_list = tree.xpath(
            "//div[@class='tablerecord-list']/div[@class='tablerecond-item']")
        for info in info_list:
            # One record object per row, like the other crawlers, so no
            # state can carry over from the previous row.
            co = Base(source)
            co.url = com_url
            co.district_name = district_name
            co.city = city
            co.region = region

            layout_texts = info.xpath("./a/span[1]/text()")
            room_type = layout_texts[0] if layout_texts else ''
            # Rooms; \d+ so that a count of 10 or more is read whole
            try:
                co.room = int(re.search(r'(\d+)室', room_type).group(1))
            except Exception:
                co.room = None
            # Halls
            try:
                co.hall = int(re.search(r'(\d+)厅', room_type).group(1))
            except Exception:
                co.hall = None
            # Orientation
            try:
                co.direction = info.xpath("./a/span[2]/text()")[0].replace(
                    ' ', '').replace('\n', '').replace('\t', '')
            except Exception:
                co.direction = None
            # Area
            try:
                area = info.xpath('./a/span[3]/text()')[0].replace('平', '')
                co.area = round(float(area), 2)
            except Exception:
                co.area = None
            # Deal date
            try:
                trade_date = info.xpath('./a/span[4]/text()')[0]
                t = time.strptime(trade_date, "%Y/%m/%d")
                co.trade_date = co.local2utc(
                    datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
            except Exception:
                co.trade_date = None
            # Average price
            try:
                avg_price = info.xpath("./a/span[6]/text()")[0]
                co.avg_price = int(avg_price.replace('元/平', ''))
            except Exception:
                co.avg_price = None
            # Total price is derived from average price * area
            try:
                co.total_price = int(int(co.avg_price) * float(co.area))
            except Exception:
                co.total_price = None
            co.insert_db()
 def final_parse(self, data):
     """Fetch one deal-list page and store a record per list item.

     Args:
         data: mapping with the keys ``link`` (page URL), ``city`` and
             ``region``.

     A failed request is logged and ends the call. A list item without a
     title is logged and skipped; any other field that cannot be parsed
     is stored as ``None``.
     """
     final_url = data['link']
     city = data['city']
     region = data['region']
     try:
         r = requests.get(url=final_url,
                          headers=self.headers,
                          proxies=self.proxies,
                          timeout=60)
     except Exception as e:
         log.error('请求失败, source={}, 没有更多小区成交 url={}, e={}'.format(
             '链家在线', final_url, e))
         return
     tree = etree.HTML(r.text)
     for info in tree.xpath("//ul[@class='listContent']/li"):
         titles = info.xpath("./div/div[@class='title']/a/text()")
         if not titles:
             # Without a title there is no community name; skip the item
             # instead of letting IndexError abort the whole page.
             log.error('source={}, 标题解析失败 url={}'.format(
                 '链家在线', final_url))
             continue
         # The title is split on spaces: name, room/hall layout, area
         title_parts = titles[0].split(' ')
         comm = Base('链家在线')
         comm.url = final_url
         # Region
         comm.region = region.strip()
         # City
         comm.city = city.strip()
         # Community name
         comm.district_name = title_parts[0]
         room_hall = title_parts[1] if len(title_parts) > 1 else ''
         # Rooms
         try:
             comm.room = int(re.search(r'(\d+)室', room_hall).group(1))
         except Exception:
             comm.room = None
         # Halls
         try:
             comm.hall = int(re.search(r'(\d+)厅', room_hall).group(1))
         except Exception:
             comm.hall = None
         # Area
         try:
             area = re.search(r"(.*?)平米", title_parts[2],
                              re.S | re.M).group(1)
             comm.area = round(float(area), 2)
         except Exception:
             comm.area = None
         try:
             direction_fitment = info.xpath(
                 "./div/div[@class='address']/div[1]/text()")[0].split('|')
             # Orientation
             comm.direction = direction_fitment[0]
             # Fitment
             comm.fitment = direction_fitment[1]
         except Exception:
             comm.direction = None
             comm.fitment = None
         # Total number of floors
         try:
             height = info.xpath(
                 "./div/div[@class='flood']/div[1]/text()")[0]
             comm.height = int(re.search(r"共(\d+)层", height).group(1))
         except Exception:
             comm.height = None
         # Deal date
         try:
             trade_date = info.xpath(
                 "./div/div[@class='address']/div[2]/text()")[0]
             t = time.strptime(trade_date, "%Y.%m.%d")
             comm.trade_date = comm.local2utc(
                 datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
         except Exception:
             comm.trade_date = None
         # Average price
         try:
             avg_price = info.xpath(
                 "./div/div[@class='flood']/div[3]/span/text()")[0]
             comm.avg_price = int(avg_price)
         except Exception:
             comm.avg_price = None
         # Total price is derived from average price * area rather than
         # read from the page
         try:
             comm.total_price = int(int(comm.avg_price) * float(comm.area))
         except Exception:
             comm.total_price = None
         comm.insert_db()
Exemplo n.º 5
0
 def parse(self, url, city):
     """Fetch one deal-list page and store a record per ``dl`` entry.

     Args:
         url: page URL, stored on every record.
         city: city stored on every record.

     A failed request is logged and ends the call. An entry without a
     title is logged and skipped. Processing of the page stops at the
     first community name containing the replacement character. Any other
     field that cannot be parsed is stored as ``None``.
     """
     try:
         # A timeout keeps a stalled connection from hanging the crawl.
         response = requests.get(url=url,
                                 headers=self.headers,
                                 proxies=self.proxies,
                                 timeout=60)
     except Exception as e:
         log.error('请求失败,source="{}", url="{}",e="{}"'.format(
             '房天下', url, e))
         return
     tree = etree.HTML(response.text)
     for info in tree.xpath("//div[@class='houseList']/dl"):
         titles = info.xpath("./dd/p/a/text()")
         if not titles:
             # Skip the entry instead of letting IndexError abort the page.
             log.error('source={}, 标题解析失败 url={}'.format('房天下', url))
             continue
         district_name_info = titles[0]
         # One record object per entry, like the other crawlers, so no
         # state can carry over from the previous entry.
         comm = Base('房天下')
         comm.url = url
         comm.city = city
         # Community name
         comm.district_name = district_name_info.split(' ')[0]
         if '�' in comm.district_name:
             log.error('source={}, 网页出现繁体字, url={}'.format('房天下', url))
             break
         # Rooms
         try:
             comm.room = int(
                 re.search(r'(\d+)室', district_name_info).group(1))
         except Exception:
             comm.room = None
         # Halls
         try:
             comm.hall = int(
                 re.search(r'(\d+)厅', district_name_info).group(1))
         except Exception:
             comm.hall = None
         # Area; the fractional part is optional so that a single-digit
         # value is accepted as well
         try:
             comm.area = float(
                 re.search(r'(\d+(?:\.\d+)?)平米',
                           district_name_info).group(1))
         except Exception:
             comm.area = None
         # Region
         try:
             region_info = info.xpath("./dd/p[2]/text()")[0]
             comm.region = region_info.split('-')[0]
         except Exception:
             comm.region = None
         # Orientation and total number of floors
         try:
             direction_info = info.xpath("./dd/p[3]")[0].xpath('string(.)')
             comm.direction = direction_info.split('|')[0]
             comm.height = int(
                 re.search(r'\(共(.*?)层\)', direction_info,
                           re.S | re.M).group(1))
         except Exception:
             comm.direction = None
             comm.height = None
         # Deal date
         # NOTE(review): the other crawlers in this file pass the date
         # through local2utc(); here it is stored as parsed. Confirm which
         # is intended.
         try:
             trade_date = info.xpath("./dd/div[2]/p[1]/text()")[0]
             t = time.strptime(trade_date, "%Y-%m-%d")
             comm.trade_date = datetime.datetime(t.tm_year, t.tm_mon,
                                                 t.tm_mday)
         except Exception:
             comm.trade_date = None
         # Total price; parsed via float so that a decimal figure is
         # accepted, then scaled by 10000 as before
         try:
             total_price = info.xpath("./dd/div[3]/p[1]/span[1]/text()")[0]
             comm.total_price = int(round(float(total_price) * 10000))
         except Exception:
             comm.total_price = None
         # Average price
         try:
             avg_price_info = info.xpath("./dd/div[3]/p[2]/b[1]/text()")[0]
             comm.avg_price = int(
                 re.search(r"(\d+)元", avg_price_info).group(1))
         except Exception:
             comm.avg_price = None
         comm.insert_db()
Exemplo n.º 6
0
    def crawler(self, city_url, city):
        """Walk all list pages of a city and store every deal row found.

        The page count is read from the second ``down_page`` link of
        ``city_url``. Each ``/PG<i>`` page is requested, every community
        link on it (``city`` + href) is opened, and every table row of the
        community page is stored as a ``Base`` record via ``insert_db()``.

        Args:
            city_url: URL of the city's first list page.
            city: URL prefix prepended to each community href.

        Request and parsing failures are logged and the affected page,
        community or row is skipped.
        """
        print(city_url)
        try:
            # A timeout keeps a stalled connection from hanging the crawl.
            res = requests.get(url=city_url, headers=self.headers,
                               proxies=self.proxies, timeout=60)
        except Exception as e:
            log.error('请求错误,source="{}",url="{}",e="{}"'.format('麦田', city_url, e))
            return
        con = etree.HTML(res.text)
        try:
            last_page = con.xpath("//a[@class='down_page']/@href")[1]
            page_num = int(re.search(r'\d+', last_page).group(0))
        except Exception as e:
            log.error('获取页码失败,source="{}",url="{}",e="{}"'.format('麦田', city_url, e))
            return
        for i in range(1, page_num + 1):
            page_url = city_url + "/PG" + str(i)
            try:
                page_res = requests.get(url=page_url, headers=self.headers,
                                        proxies=self.proxies, timeout=60)
            except Exception as e:
                log.error('请求错误,source="{}",url="{}",e="{}"'.format('麦田', page_url, e))
                continue
            page_con = etree.HTML(page_res.text)
            for temp_url in page_con.xpath("//h1/a/@href"):
                comm_url = city + temp_url
                try:
                    co_res = requests.get(url=comm_url, headers=self.headers,
                                          proxies=self.proxies, timeout=60)
                except Exception as e:
                    log.error('请求错误,source="{}",url="{}",e="{}"'.format('麦田', comm_url, e))
                    continue

                co_con = etree.HTML(co_res.text)
                try:
                    # City
                    city_name = co_con.xpath("//div/a[@class='show']/text()")[0]
                    # Region: the text between square brackets
                    region_text = co_con.xpath(
                        "//section[@class='fl home_main']/p[3]/a/text()")[-1]
                    region = re.search(r"\[(.*)\]", region_text,
                                       re.S | re.M).group(1)
                    # Community name
                    district_name = co_con.xpath("//cite/span/text()")[0]
                    rows = co_con.xpath("//table/tbody/tr")
                except Exception as e:
                    log.error('获取城市区域小区名失败, source="{}",url="{}",e="{}"'.format('麦田', comm_url, e))
                    continue
                for tag in rows:
                    # One record object per row so that no field can carry
                    # over from the previous row.
                    com = Base(source)
                    com.url = comm_url
                    com.city = city_name
                    com.region = region
                    com.district_name = district_name
                    try:
                        self._fill_deal_row(com, tag)
                    except Exception as e:
                        # One malformed row must not abort the whole crawl.
                        log.error('解析成交记录失败, source="{}",url="{}",e="{}"'.format('麦田', comm_url, e))
                        continue
                    com.insert_db()

    def _fill_deal_row(self, com, tag):
        """Fill ``com`` from one table row of a community page.

        Room count, hall count and total price fall back to ``None`` when
        unparsable. A malformed area, average price, date, floor or
        orientation raises so that the caller can skip the row.
        """
        # Area
        area = tag.xpath("./td[2]/text()")[0].replace('㎡', '')
        com.area = round(float(area), 2)
        # Average price
        avg_price = tag.xpath("./td[3]/text()")[0]
        com.avg_price = int(re.search(r'(\d+)', avg_price).group(1))
        # Total price is derived from average price * area
        try:
            com.total_price = int(int(com.avg_price) * float(com.area))
        except Exception:
            com.total_price = None
        # Deal date: second-to-last text node among the row's cells
        trade_date = tag.xpath("./td/text()")[-2]
        if trade_date:
            t = time.strptime(trade_date, "%Y-%m-%d")
            com.trade_date = com.local2utc(
                datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
        room_type = tag.xpath("./td//p/a/text()")[0]
        # Rooms; \d+ so that a count of 10 or more is read whole
        try:
            com.room = int(re.search(r'(\d+)室', room_type).group(1))
        except Exception:
            com.room = None
        # Halls
        try:
            com.hall = int(re.search(r'(\d+)厅', room_type).group(1))
        except Exception:
            com.hall = None
        # Number before '层', stored in ``floor``.
        # NOTE(review): the original comment called this the total number
        # of floors, which the other crawlers store in ``height``. Confirm.
        floor = tag.xpath("./td//p/span/text()")[0]
        com.floor = int(re.search(r'(\d+)层', floor).group(1))
        # Orientation: second space-separated token of the same text
        com.direction = floor.split(' ')[1]
Exemplo n.º 7
0
 def get_detail(self, response, city, region, url):
     """Parse a deal-list page and store one record per list item.

     Args:
         response: response object whose ``text`` is the page HTML.
         city: city stored (stripped) on every record.
         region: region stored (stripped) on every record.
         url: page URL stored on every record.

     Every field after the title is stored as ``None`` when it cannot be
     parsed.

     Raises:
         IndexError: if a list item has no title text.
     """
     tree = etree.HTML(response.text)
     for info in tree.xpath("//div[@class='house-detail']/ul/li"):
         comm = Base('Q房网')
         # Link
         comm.url = url
         # City
         comm.city = city.strip()
         # Region
         comm.region = region.strip()
         # The title carries the community name, the layout and the area
         title = info.xpath("./div[1]/p[1]/a[1]/text()")[0]
         # Community name
         comm.district_name = title.split(' ')[0]
         # Rooms
         try:
             comm.room = int(re.search(r"(\d+)室", title).group(1))
         except Exception:
             comm.room = None
         # Halls
         try:
             comm.hall = int(re.search(r"(\d+)厅", title).group(1))
         except Exception:
             comm.hall = None
         # Area. The dot is escaped and the fractional part optional: the
         # unescaped '.' matched any character, so text such as "2厅89平米"
         # was captured as "2厅89" and the area was lost.
         try:
             area = re.search(r"(\d+(?:\.\d+)?)平米", title).group(1)
             comm.area = round(float(area), 2)
         except Exception:
             comm.area = None
         # Orientation and total number of floors. When the fourth span
         # already holds the floor text, no orientation is recorded.
         try:
             direction = info.xpath("./div[1]/p[2]/span[4]/text()")[0]
             if '层' not in direction:
                 comm.direction = direction
                 height = info.xpath("./div[1]/p[2]/span[6]/text()")[0]
                 comm.height = int(re.search(r"(\d+)层", height).group(1))
             else:
                 comm.direction = None
                 comm.height = int(
                     re.search(r"(\d+)层", direction).group(1))
         except Exception:
             comm.direction = None
             comm.height = None
         # Average price
         try:
             avg_price = info.xpath("./div[2]/p[1]/text()")[0]
             comm.avg_price = int(re.search(r"\d+", avg_price).group(0))
         except Exception:
             comm.avg_price = None
         # Total price is derived from average price * area rather than
         # read from the page
         try:
             comm.total_price = int(int(comm.avg_price) * float(comm.area))
         except Exception:
             comm.total_price = None
         # Deal date
         try:
             trade_date = info.xpath("./div[3]/span[1]/text()")[0]
             t = time.strptime(trade_date, "%Y.%m.%d")
             comm.trade_date = comm.local2utc(
                 datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
         except Exception:
             comm.trade_date = None
         comm.insert_db()