Exemplo n.º 1
0
    def parse(self, room_url, co_name, region, city_name):
        """Scrape every result page of one community and store each listing.

        The first request to ``room_url`` is only used to read the page count
        from the ``共N页`` marker. Each result page is then fetched, one
        ``Base`` record is built per ``right-information`` block, and the
        record is persisted with ``insert_db()``.

        Args:
            room_url: Listing URL of the community. Everything from the first
                ``#`` onwards is replaced by ``n`` and the page number is
                appended to address an individual result page.
            co_name: Community name, stored as ``district_name``.
            region: Region stored on every record.
            city_name: City stored on every record.

        Returns:
            None. The method returns early (after logging) when the first
            request fails or when the page shows no page-count marker.
        """
        try:
            # timeout matches final_parse so a stalled proxy cannot hang forever
            page_index = requests.get(url=room_url,
                                      headers=self.headers,
                                      proxies=self.proxies,
                                      timeout=60)
        except Exception as e:
            log.error('请求错误, source="{}",url="{}",e="{}"'.format(
                '新浪乐居', room_url, e))
            return

        page_count = re.search(r'共(\d+)页', page_index.text)
        if not page_count:
            log.info('source={}, url={}, 小区无相关数据'.format('新浪乐居', room_url))
            return

        # The page URL prefix does not depend on the page number.
        page_url_prefix = re.sub('#.*', 'n', room_url)
        for page_no in range(1, int(page_count.group(1)) + 1):
            url = page_url_prefix + str(page_no)
            # NOTE(review): this retries without limit until the request
            # succeeds; consider bounding it if the proxy can stay down.
            while True:
                try:
                    res = requests.get(url=url,
                                       headers=self.headers,
                                       proxies=self.proxies,
                                       timeout=60)
                    break
                except Exception as e:
                    log.error('请求错误, source="{}",url="{}",e="{}"'.format(
                        '新浪乐居', url, e))

            room_html = etree.HTML(res.text)
            # NOTE(review): the room-type, size, unit-price, floor and date
            # lookups below index [0] unguarded, so a listing missing one of
            # those nodes raises and stops the whole crawl.
            for listing in room_html.xpath(
                    "//div[@class='right-information']"):
                # `source` is not defined in this method; it is expected to
                # be provided by the enclosing module.
                room = Base(source)
                room.url = url
                # Community name
                room.district_name = co_name
                # City
                room.city = city_name
                # Region
                room.region = region

                room_type = listing.xpath("./h3/span[2]/text()")[0]
                # Number of bedrooms
                room_match = re.search(r'(\d)室', room_type, re.S | re.M)
                room.room = int(room_match.group(1)) if room_match else None
                # Number of living rooms
                hall_match = re.search(r'(\d)厅', room_type, re.S | re.M)
                room.hall = int(hall_match.group(1)) if hall_match else None

                # Floor area
                size = listing.xpath("./h3/span[3]/text()")[0]
                area = size.replace('平米', '')
                if area:
                    room.area = round(float(area), 2)

                # Unit price
                avg_price = listing.xpath(
                    ".//div[@class='size  fs14']/text()")[0]
                room.avg_price = int(
                    re.search(r'(\d+)', avg_price, re.S | re.M).group(1))

                # Total price is not scraped; it is derived from
                # unit price x area.
                try:
                    room.total_price = int(
                        int(room.avg_price) * float(room.area))
                except Exception:
                    room.total_price = None

                # Orientation / decoration, separated by '|'. With three
                # parts the first one is skipped.
                try:
                    info_node = listing.xpath(".//div[@class='t1 fs14']")[0]
                    info_parts = info_node.xpath('string(.)').split('|')
                    if len(info_parts) == 2:
                        room.fitment = info_parts[1]
                        room.direction = info_parts[0]
                    elif len(info_parts) == 3:
                        room.fitment = info_parts[2]
                        room.direction = info_parts[1]
                except Exception:
                    room.fitment = None
                    room.direction = None

                floor_info = listing.xpath(
                    ".//div[@class='fs14']/text()[1]")[0]
                # Floor of the unit: first number before the '/'
                try:
                    floor = re.search('(.*?)/', floor_info).group(1)
                    room.floor = int(re.search(r'\d+', floor).group(0))
                except Exception:
                    room.floor = None
                # Total floors of the building: number after the '/'
                try:
                    room.height = int(
                        re.search(r'.*?/(\d+)层', floor_info).group(1))
                except Exception:
                    room.height = None

                # Transaction date
                trade_date = listing.xpath(".//div[@class='date']/text()")[0]
                if trade_date:
                    t = time.strptime(trade_date, "%Y-%m-%d")
                    room.trade_date = room.local2utc(
                        datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
                room.insert_db()
 def final_parse(self, data):
     """Scrape one page of sold listings and store a record per entry.

     Fetches ``data['link']``, iterates over the ``<li>`` items of the
     ``listContent`` list, fills a ``Base`` record for each one and persists
     it with ``insert_db()``. Optional fields that cannot be extracted are
     stored as ``None``.

     Args:
         data: Mapping with the keys ``'link'`` (page URL), ``'city'`` and
             ``'region'``; city and region are stripped before being stored.

     Returns:
         None. The method returns early (after logging) when the request
         fails.
     """
     final_url = data['link']
     city = data['city']
     region = data['region']
     try:
         r = requests.get(url=final_url,
                          headers=self.headers,
                          proxies=self.proxies,
                          timeout=60)
     except Exception as e:
         log.error('请求失败, source={}, 没有更多小区成交 url={}, e={}'.format(
             '链家在线', final_url, e))
         return

     tree = etree.HTML(r.text)
     for info in tree.xpath("//ul[@class='listContent']/li"):
         comm = Base('链家在线')
         comm.url = final_url
         # Region
         comm.region = region.strip()
         # City
         comm.city = city.strip()

         # The title is split on single spaces into
         # "<community> <rooms/halls> <area>".
         title = info.xpath("./div/div[@class='title']/a/text()")[0]
         title_parts = title.split(' ')
         # Community name
         comm.district_name = title_parts[0]
         room_hall = title_parts[1] if len(title_parts) > 1 else None

         # Number of bedrooms
         room_match = (re.search(r'(\d+)室', room_hall, re.S | re.M)
                       if room_hall else None)
         comm.room = int(room_match.group(1)) if room_match else None
         # Number of living rooms
         hall_match = (re.search(r'(\d+)厅', room_hall, re.S | re.M)
                       if room_hall else None)
         comm.hall = int(hall_match.group(1)) if hall_match else None

         # Floor area
         try:
             area = re.search("(.*?)平米", title_parts[2],
                              re.S | re.M).group(1)
             comm.area = round(float(area), 2)
         except Exception:
             comm.area = None

         try:
             direction_fitment = info.xpath(
                 "./div/div[@class='address']/div[1]/text()")[0].split('|')
             # Orientation
             comm.direction = direction_fitment[0]
             # Decoration
             comm.fitment = direction_fitment[1]
         except Exception:
             # Both are reset together when either part is missing.
             comm.direction = None
             comm.fitment = None

         # Total floors of the building
         try:
             height = info.xpath(
                 "./div/div[@class='flood']/div[1]/text()")[0]
             comm.height = int(
                 re.search(r"共(\d+)层", height, re.S | re.M).group(1))
         except Exception:
             comm.height = None

         # Transaction date
         try:
             trade_date = info.xpath(
                 "./div/div[@class='address']/div[2]/text()")[0]
             t = time.strptime(trade_date, "%Y.%m.%d")
             comm.trade_date = comm.local2utc(
                 datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
         except Exception:
             comm.trade_date = None

         # Unit price
         try:
             avg_price = info.xpath(
                 "./div/div[@class='flood']/div[3]/span/text()")[0]
             comm.avg_price = int(avg_price)
         except Exception:
             comm.avg_price = None

         # Total price is not scraped from the page; it is derived from
         # unit price x area and left as None when either is missing.
         try:
             comm.total_price = int(
                 int(comm.avg_price) * float(comm.area))
         except Exception:
             comm.total_price = None
         comm.insert_db()