예제 #1
0
def parse_price(number_string: str):
    """Split a price string on ``~`` and convert each part with ``clean_number``.

    Issue #87: the monthly price of social housing (社會住宅) is a range.

    Returns a dict that always holds ``monthly_price`` (the part before the
    first ``~``) and, when at least one ``~`` is present, also
    ``min_monthly_price`` (the part right after the first ``~``).
    """
    # NOTE(review): for a "low~high" range this stores the *second* number
    # under `min_monthly_price` -- confirm that ordering is intended.
    first, *others = number_string.split('~')
    parsed = {'monthly_price': clean_number(first)}
    if others:
        parsed['min_monthly_price'] = clean_number(others[0])

    return parsed
예제 #2
0
    def get_target_value(self, house, target_config):
        """Resolve the value described by ``target_config`` for ``house``.

        ``target_config['path']`` is a sequence: its first item names an
        attribute of ``house``, the remaining items are keys followed through
        nested dicts.  The walk gives up with ``None`` as soon as the current
        value is not a dict or the looked-up value is falsy (missing key,
        ``0``, ``''``, empty container, ...).

        Post-processing, first match wins:

        * ``target_config['fn']`` -- called with ``house``; its result is
          returned regardless of what the walk found.
        * the walked value is a dict -- returned as a JSON string.
        * ``target_config['clean_number']`` is truthy -- the value goes
          through ``clean_number``; a ``ValueError`` is logged and turned
          into ``None``.
        """
        the_path = target_config['path']
        attr_name = the_path[0]
        keys = the_path[1:]
        value = getattr(house, attr_name)

        for key in keys:
            child = value.get(key) if isinstance(value, dict) else None
            if not child:
                # dead end: not a dict, key missing, or a falsy value
                value = None
                break
            value = child

        custom_fn = target_config.get('fn')
        if custom_fn:
            return custom_fn(house)

        if isinstance(value, dict):
            return json.dumps(value, ensure_ascii=False)

        if target_config.get('clean_number'):
            try:
                return clean_number(value)
            except ValueError:
                logging.error(f'Invalid number: {value}')
                return None

        return value
예제 #3
0
    def gen_detail_shared_attrs(self, detail_dict):
        """Normalise ``detail_dict`` in place and build the shared attributes.

        In-place clean-up of ``detail_dict``:

        * ``price`` is converted with ``clean_number``;
        * every ``price_includes`` entry loses the character '含';
        * entries of ``environment['生活機能']`` and
          ``environment['附近交通']`` (when present) lose the character '近';
          the latter additionally has the characters of the regex class
          below removed.

        The returned dict starts with vendor, house id, monthly price and
        images, then is overlaid -- in this order -- with the results of
        ``get_shared_price``, ``get_shared_basic``, ``get_shared_environment``,
        ``get_shared_boolean_info`` and ``get_shared_misc``; later sources
        win on duplicate keys.
        """
        detail_dict['price'] = clean_number(detail_dict['price'])

        detail_dict['price_includes'] = [
            item.replace('含', '') for item in detail_dict['price_includes']
        ]

        environment = detail_dict['environment']
        if '生活機能' in environment:
            environment['生活機能'] = [
                item.replace('近', '') for item in environment['生活機能']
            ]

        if '附近交通' in environment:
            environment['附近交通'] = [
                re.sub('[  ]', '', item.replace('近', ''))
                for item in environment['附近交通']
            ]

        # helpers are invoked in the same order as before; price needs basic
        basic_info = self.get_shared_basic(detail_dict)
        price_info = self.get_shared_price(detail_dict, basic_info)
        env_info = self.get_shared_environment(detail_dict)
        boolean_info = self.get_shared_boolean_info(detail_dict)
        misc_info = self.get_shared_misc(detail_dict)

        shared = {
            'vendor': self.vendor,
            'vendor_house_id': detail_dict['house_id'],
            'monthly_price': detail_dict['price'],
            'imgs': detail_dict['imgs'],
        }
        for part in (price_info, basic_info, env_info, boolean_info,
                     misc_info):
            shared.update(part)

        return shared
예제 #4
0
def get_list_val(house, regular_attr, top_attr=None, to_number=False):
    """Read a value from ``house`` under one of two possible keys.

    ``regular_attr`` is tried first, then ``top_attr``.  When neither key is
    present the result is ``None``.  With ``to_number`` set, a found value
    that is not ``None`` is passed through ``clean_number``.
    """
    for attr in (regular_attr, top_attr):
        if attr in house:
            value = house[attr]
            break
    else:
        # neither key exists
        return None

    if to_number and value is not None:
        return clean_number(value)

    return value
예제 #5
0
def parse_dealtime(row: House):
    """Estimate the deal date of ``row`` as a ``'%Y-%m-%d'`` string.

    The date is ``row.created_at`` moved back by the age found in
    ``row.list_meta['posttime']`` and converted with ``taipei_time``.
    ``row.list_meta['addInfo']`` is only used as a validity check: when no
    number can be read from it, the row is treated as having no deal time.

    Returns ``None`` whenever ``addInfo`` or ``posttime`` is missing or
    cannot be parsed, instead of raising.
    """
    add_info = row.list_meta.get('addInfo')
    if not add_info:
        # no deal information at all; `.split` on None used to crash here
        return None

    # example: 17天成交(1)  ("dealt in 17 days")
    days_taken = add_info.split('天')[0]
    # example: 今日 (today), 昨日 (yesterday), x天前 (x days ago)
    last_updated = row.list_meta.get('posttime')

    try:
        days_taken = clean_number(days_taken)
    except ValueError:
        return None

    if days_taken is None:
        return None

    if last_updated == '今日':
        days_ago = 0
    elif last_updated == '昨日':
        days_ago = 1
    else:
        try:
            days_ago = clean_number(last_updated)
        except ValueError:
            return None
        if days_ago is None:
            # posttime missing or without any digit
            return None

    base_time = row.created_at - timedelta(days=days_ago)

    return base_time.astimezone(taipei_time).strftime('%Y-%m-%d')
예제 #6
0
    def count_and_parse_list(self, response):
        """Parse one list-API response; yield follow-up requests and items.

        On the first page (``meta.page == 0``) the total is read from
        ``data['records']``, ``self.N_PAGE`` is set to the number of items
        returned on this page, and one list request is yielded for every
        remaining page.

        For every house on the page (``topData`` entries are prepended unless
        ``self.novip`` is set) a ``HouseStats`` row is looked up or created
        for the current job; houses that already had a row are skipped.  New
        houses yield a ``RawHouseItem``, a ``GenericHouseItem`` and a detail
        request.
        """
        meta = response.meta['rental']
        data = json.loads(response.text)

        if meta.page == 0:
            count = clean_number(data['records'])
            logging.info(f'[{meta.name}] total {count} house to crawl!')

            # #items return per request may differ from API endpoint
            self.N_PAGE = len(data['data']['data'])

            if self.N_PAGE > 0:
                # generate all list request as now we know number of result
                cur_page = 1
                while cur_page * self.N_PAGE < count:
                    yield self.gen_list_request(
                        util.ListRequestMeta(meta.id, meta.name, cur_page))
                    cur_page += 1
            elif count:
                # With an empty first page `cur_page * 0 < count` would stay
                # true forever, so pagination is skipped instead.
                logging.warning(
                    f'[{meta.name}] first page is empty but {count} records '
                    'reported, skip pagination')

        houses = data['data']['data']

        if not self.novip:
            houses = data['data']['topData'] + houses

        for house in houses:
            # copy from twrh
            house['is_vip'] = 'id' not in house
            house_item = self.gen_shared_attrs(house, meta)

            _, created = HouseStats.get_or_create(
                job_id=self.job.id, house_id=house_item['vendor_house_id'])

            if not created:
                # already handled within this job
                continue

            yield RawHouseItem(house_id=house_item['vendor_house_id'],
                               vendor=self.vendor,
                               is_list=True,
                               raw=json.dumps(house, ensure_ascii=False))
            yield GenericHouseItem(**house_item)
            yield self.gen_detail_request(
                util.DetailRequestMeta(house_item['vendor_house_id'], False))
예제 #7
0
    def default_parse_list(self, response):
        """Parse one list-API response; yield follow-up requests and items.

        The total number of results is read from ``data['records']``.  On the
        first page (``meta.page == 0``) a list request is yielded for every
        further page while ``page * self.N_PAGE`` is below that total.

        Every house of the page (``topData`` followed by ``data``) then
        yields a ``RawHouseItem``, a ``GenericHouseItem`` and a detail
        request.
        """
        payload = json.loads(response.text)
        total = clean_number(payload['records'])
        meta = response.meta['rental']

        if meta.page == 0:
            # the total is known now, so every remaining page can be queued
            page = 1
            while page * self.N_PAGE < total:
                next_meta = ListRequestMeta(meta.id, meta.name, page)
                yield self.gen_list_request(next_meta)
                page += 1

        listing = payload['data']

        for house in listing['topData'] + listing['data']:
            house_item = self.gen_shared_attrs(house, meta)
            house_id = house_item['vendor_house_id']

            yield RawHouseItem(house_id=house_id,
                               vendor=self.vendor,
                               is_list=True,
                               raw=json.dumps(house, ensure_ascii=False))
            yield GenericHouseItem(**house_item)
            yield self.gen_detail_request(DetailRequestMeta(house_id))
예제 #8
0
    def gen_shared_attrs(self, house, meta: ListRequestMeta):
        """Build the generic house attributes from one list-API entry.

        Regular entries and ``topData`` entries use different key names, so
        most values are read through ``get_list_val``.

        Returns a dict with vendor, house id / url, images, region enums,
        property type, floor information, ``floor_ping`` and the keys produced
        by ``parse_price``.  Keys whose value is ``None`` are dropped.
        """
        house_id = get_list_val(house, 'id', 'post_id')

        url = "{}/v1/house/rent/detail?id={}".format(API_URL, house_id)

        if 'region_name' in house:
            top_region = self.get_enum(TopRegionType, house_id,
                                       house['region_name'])
        else:
            # topData doesn't contain region_name for some reason..
            top_region = self.get_enum(TopRegionType, house_id, meta.name)

        sub_region = self.get_enum(
            SubRegionType, house_id, '{}{}'.format(
                TopRegionType(top_region).name,
                get_list_val(house, 'section_name', 'section_str')))

        property_type = None
        if 'kind_name' in house:
            # the enum used to be looked up but never stored
            property_type = self.get_enum(PropertyType, house_id,
                                          get_list_val(house, 'kind_name'))

        floor = None
        total_floor = None
        if 'floor_str' in house:
            floor_info = house['floor_str'].split('/')
            if len(floor_info) >= 2:
                floor = clean_number(floor_info[0])
                total_floor = clean_number(floor_info[1])

                # Compare the raw text: `floor` is already a number (or None)
                # here and can never equal the rooftop label.
                if floor_info[0] == '頂樓加蓋' and total_floor is not None:
                    # rooftop addition sits one level above the top floor
                    floor = total_floor + 1
                elif 'B' in floor_info[0] and floor:
                    # basement
                    floor = -floor
                elif floor is None:
                    # whole building (整棟)
                    floor = 0

        price_range = parse_price(get_list_val(house, 'price'))

        generic_house = {
            'vendor': self.vendor,
            'vendor_house_id': house_id,
            'vendor_house_url': url,
            'imgs': get_list_val(house, 'photo_list'),
            'top_region': top_region,
            'sub_region': sub_region,
            'property_type': property_type,
            'floor_ping': clean_number(house['area']),
            'floor': floor,
            'total_floor': total_floor,
            **price_range
        }

        # 99 and 100 are magic number in 591...
        # https://github.com/g0v/tw-rental-house-data/issues/11
        if generic_house['floor'] == 99:
            generic_house['floor'] = 0
        elif generic_house['floor'] == 100 and generic_house['total_floor']:
            generic_house['floor'] = generic_house['total_floor'] + 1

        # drop attributes that could not be determined
        return {
            key: value
            for key, value in generic_house.items()
            if value is not None
        }
예제 #9
0
    def get_shared_basic(self, detail_dict):
        """Derive the basic attributes of a house from a scraped detail dict.

        Depending on what ``detail_dict`` (and its ``side_metas``) contains,
        the result may hold: region enums, ``rough_address``, ``deal_status``
        (always), ``building_type``, ``property_type``, floor information
        (``floor``, ``total_floor``, ``is_rooftop``,
        ``dist_to_highest_floor``), ``floor_ping`` and one counter per entry
        of ``self.apt_features`` plus ``apt_feature_code``.
        """
        basic = {}

        # top_region, sub_region
        if 'top_region' in detail_dict:
            basic['top_region'] = self.get_enum(enums.TopRegionType,
                                                detail_dict['house_id'],
                                                detail_dict['top_region'])

            region_name = '{}{}'.format(detail_dict['top_region'],
                                        detail_dict['sub_region'])
            basic['sub_region'] = self.get_enum(enums.SubRegionType,
                                                detail_dict['house_id'],
                                                region_name)

        if 'address' in detail_dict:
            basic['rough_address'] = detail_dict['address']

        # deal_status
        # Issue #15: the crawler only updates deal_status on a deal and lets
        # `syncstateful` update the rest.
        # Issue #14: the status is always written, an item may be reopened.
        basic['deal_status'] = (enums.DealStatusType.DEAL
                                if detail_dict['is_deal'] else
                                enums.DealStatusType.OPENED)

        side_metas = detail_dict['side_metas']

        # building_type: apartment / elevator building / townhouse
        if '型態' in side_metas:
            kind = side_metas['型態']
            if kind in ('別墅', '透天厝'):
                basic['building_type'] = enums.BuildingType.透天
            elif kind == '住宅大樓':
                basic['building_type'] = enums.BuildingType.電梯大樓
            else:
                basic['building_type'] = self.get_enum(
                    enums.BuildingType, detail_dict['house_id'], kind)

        # property type
        if '現況' in side_metas:
            basic['property_type'] = self.get_enum(enums.PropertyType,
                                                   detail_dict['house_id'],
                                                   side_metas['現況'])

        # is_rooftop, floor, total_floor
        # TODO: use title to detect rooftop
        if '樓層' in side_metas:
            # the text looks like 1F/2F, 頂樓加蓋/2F (rooftop addition)
            # or 整棟/2F (whole building)
            parts = side_metas['樓層'].split('/')
            floor_no = clean_number(parts[0])
            total = clean_number(parts[1])
            rooftop = False

            if parts[0] == '頂樓加蓋':
                rooftop = True
                current = total + 1
            elif 'B' in parts[0] and floor_no:
                # basement
                current = -floor_no
            elif floor_no:
                current = floor_no
            else:
                # no usable floor number
                current = 0

            basic['floor'] = current
            basic['total_floor'] = total
            basic['is_rooftop'] = rooftop
            basic['dist_to_highest_floor'] = total - current

        if '坪數' in side_metas:
            basic['floor_ping'] = clean_number(side_metas['坪數'])

        if '格局' in side_metas:
            layout = side_metas['格局']

            for field, label in self.apt_features.items():
                basic[field] = (clean_number(layout[label])
                                if label in layout else 0)

            basic['apt_feature_code'] = '{:02d}{:02d}{:02d}{:02d}'.format(
                basic['n_balcony'], basic['n_bath_room'],
                basic['n_bed_room'], basic['n_living_room'])

        # TODO: rough_address

        return basic
예제 #10
0
    def get_shared_price(self, detail_dict, basic_info):
        """Derive deposit, management fee, parking and per-ping price.

        Reads ``detail_dict['top_metas']``, ``detail_dict['price_includes']``
        and ``detail_dict['price']``; ``basic_info['floor_ping']`` (when
        present) is used for ``per_ping_price``.

        Keys that may appear in the result: ``deposit_type``,
        ``n_month_deposit``, ``deposit``, ``is_require_management_fee``,
        ``monthly_management_fee``, ``has_parking``,
        ``is_require_parking_fee``, ``monthly_parking_fee`` and
        ``per_ping_price``.  Ratios that would need a division by a missing
        or zero price / floor size are left as ``None`` or omitted instead of
        raising.
        """
        ret = {}

        # deposit_type, n_month_deposit
        if '押金' in detail_dict['top_metas']:
            deposit = detail_dict['top_metas']['押金']
            month_deposit = deposit.split('個月')
            if len(month_deposit) == 2:
                # deposit expressed as a number of months
                ret['deposit_type'] = enums.DepositType.月
                ret['n_month_deposit'] = self.from_zh_number(month_deposit[0])
                ret['deposit'] = ret['n_month_deposit'] * detail_dict['price']
            elif deposit.replace(',', '').isdigit():
                # fixed amount
                price = detail_dict['price']
                ret['deposit'] = clean_number(deposit)
                ret['deposit_type'] = enums.DepositType.定額
                # avoid dividing by a zero / missing price
                ret['n_month_deposit'] = (ret['deposit'] / price
                                          if price else None)
            elif deposit == '面議':
                # negotiable
                ret['deposit_type'] = enums.DepositType.面議
                ret['n_month_deposit'] = None
                ret['deposit'] = None
            else:
                ret['deposit_type'] = enums.DepositType.其他
                ret['n_month_deposit'] = None
                ret['deposit'] = None

        # is_require_management_fee, monthly_management_fee
        if '管理費' in detail_dict['price_includes']:
            # already part of the rent
            ret['is_require_management_fee'] = False
            ret['monthly_management_fee'] = 0
        elif '管理費' in detail_dict['top_metas']:
            mgmt_fee = detail_dict['top_metas']['管理費']
            # could be xxx元/月, --, -, !@$#$%...
            if '元/月' in mgmt_fee:
                ret['is_require_management_fee'] = True
                ret['monthly_management_fee'] = clean_number(mgmt_fee)
            else:
                ret['is_require_management_fee'] = False
                ret['monthly_management_fee'] = 0

        # *_parking*
        if '車 位' in detail_dict['top_metas']:
            parking_str = detail_dict['top_metas']['車 位']
            parking = clean_number(parking_str)

            ret['has_parking'] = True
            if parking:
                ret['is_require_parking_fee'] = True
                ret['monthly_parking_fee'] = parking
            elif '已含' in parking_str:
                # included in the rent
                ret['is_require_parking_fee'] = False
                ret['monthly_parking_fee'] = 0
            elif '費用另計' in parking_str:
                # charged separately, amount unknown
                ret['is_require_parking_fee'] = True
                ret['monthly_parking_fee'] = 0
            elif parking_str == '無':
                ret['has_parking'] = False

        # per ping price
        if 'floor_ping' in basic_info:
            floor_ping = basic_info['floor_ping']
            price = detail_dict['price']
            # skip when the size is zero / unknown or the price is unknown
            if floor_ping and price is not None:
                # `or 0` also covers a fee that was stored as None
                mgmt = ret.get('monthly_management_fee') or 0
                parking = ret.get('monthly_parking_fee') or 0
                total_price = price + mgmt + parking
                ret['per_ping_price'] = total_price / floor_ping

        return ret
예제 #11
0
    def collect_dict(self, response):
        """Scrape a house detail page into a plain dict of raw fields.

        The page is read through ``self.css`` / ``self.css_first`` and
        ``response.css``.  No enum mapping happens here; values are kept as
        text except for a few numbers converted with ``clean_number`` and the
        layout string, which is split into a ``{label: count}`` dict.

        Returns a dict with the keys ``house_id``, ``n_views``,
        ``top_region``, ``sub_region``, ``address``, ``title``, ``imgs``,
        ``top_metas``, ``facilities``, ``without_facilities``,
        ``environment``, ``sublets``, ``neighbor``, ``desp``, ``price``,
        ``price_includes``, ``is_deal``, ``side_metas``, ``due_day`` and
        ``owner``.
        """
        # title
        title = self.css_first(response, '.houseInfoTitle', deep_text=True)

        # region, breadcrumb is home / rental / xx city / xx district
        breadcrumb = self.css(response, '#propNav a', deep_text=True)
        if len(breadcrumb) >= 4:
            if breadcrumb[2] == '出租' and len(breadcrumb) >= 5:
                # e.g. 首頁 > 店面 > 出租 > 台北市 > 大安區 > 台北市大安區安和路二段
                # (home > shop > for rent > city > district > address)
                top_region = breadcrumb[3]
                sub_region = breadcrumb[4]
            else:
                # e.g. 首頁 > 租屋 > 台北市 > 大安區 > 獨立套房 > 20000-30000元 > ...
                # (home > rental > city > district > type > price > address)
                top_region = breadcrumb[2]
                sub_region = breadcrumb[3]
        else:
            top_region = '__UNKNOWN__'
            sub_region = '__UNKNOWN__'

        # rough address
        address = self.css_first(response, '#propNav .addr', deep_text=True)

        # image, it's in a hidden input
        imgs = self.css_first(response,
                              '#hid_imgArr::attr(value)',
                              allow_empty=True).replace('"', '').split(',')

        if imgs[0] == "":
            imgs.pop(0)

        # top meta, including deposit (押金), legal usage (法定用途), etc..
        top_meta_keys = self.css(response, '.labelList-1 .one', deep_text=True)
        top_meta_values = self.css(response,
                                   '.labelList-1 .two em',
                                   deep_text=True)
        top_metas = dict_from_tuple(top_meta_keys, top_meta_values)

        if '身份要求' in top_metas:
            # tenant requirements are a '、'-separated list
            top_metas['身份要求'] = top_metas['身份要求'].split('、')

        # facilities, including wardrobe, sofa, etc..
        # an item whose span carries the class `no` is a missing facility
        fa_status = self.css(response, '.facility li span::attr(class)')
        fa_text = self.css(response, '.facility li', deep_text=True)
        fa = []
        without_fa = []
        for index, key in enumerate(fa_text):
            if fa_status[index] != 'no':
                fa.append(key)
            else:
                without_fa.append(key)

        # environment
        # <p><strong>生活機能</strong>:近便利商店;傳統市場;夜市</p>
        env_keys = self.css(response, '.lifeBox > p strong', deep_text=True)
        env_desps = self.css(response, '.lifeBox > p', deep_text=True)
        env_desps = [
            re.sub('.*:', '', desp).split(';') for desp in env_desps
        ]
        env = dict_from_tuple(env_keys, env_desps)

        # neighbor
        nei_selector = response.css('.lifeBox.community')
        nei = {}
        if nei_selector:
            nei['name'] = self.css_first(nei_selector,
                                         '.communityName a',
                                         deep_text=True)
            nei['desp'] = self.css_first(nei_selector,
                                         '.communityIntroduce::text',
                                         deep_text=True,
                                         allow_empty=True)
            nei['url'] = SITE_URL + self.css_first(
                nei_selector,
                '.communityIntroduce a::attr(href)',
                allow_empty=True)
            nei_keys = self.css(nei_selector, '.communityDetail p::text')
            nei_values = self.css(nei_selector,
                                  '.communityDetail p > *',
                                  deep_text=True)
            nei['info'] = dict_from_tuple(nei_keys, nei_values)

        # sublets (shared suites / rooms)
        sublets_keys = self.css(response, '.list-title span', deep_text=True)
        sublets_list = response.css('.house-list')
        sublets = []
        for sublet in sublets_list:
            texts = self.css(sublet, 'li', deep_text=True)
            sublet_dict = dict_from_tuple(sublets_keys, texts)
            if '租金' in sublet_dict:
                sublet_dict['租金'] = clean_number(sublet_dict['租金'])
            if '坪數' in sublet_dict:
                sublet_dict['坪數'] = clean_number(sublet_dict['坪數'])

            sublets.append(sublet_dict)

        # desp
        desp = self.css(response, '.houseIntro *', deep_text=True)

        # q and a
        # TODO
        # TODO: format correct

        # price
        # <div class="price clearfix"><i>14,500 <b>元/月</b></i></div>
        price = self.css_first(response, '.price i', deep_text=True)

        # built-in facility
        price_includes = self.css_first(response,
                                        '.detailInfo .price+.explain',
                                        deep_text=True,
                                        allow_empty=True).split('/')

        # lease status: the page shows a `.filled` element once it is dealt
        is_deal = len(response.css('.filled').extract()) > 0

        # side meta, each item is "key:value"; the value may contain ':' too
        sides = self.css(response, '.detailInfo .attr li', deep_text=True)
        side_metas = {}
        for side in sides:
            tokens = side.split(':')
            if len(tokens) >= 2:
                side_metas[tokens[0]] = ':'.join(tokens[1:])

        # layout, e.g. 格局 :    3房2廳2衛2陽台
        if '格局' in side_metas:
            # TODO: open-plan layout (開放式格局)
            # `\d+` so that counts of 10 or more keep all their digits
            parts = re.findall(r'(\d+)([^\d]+)', side_metas['格局'])
            side_metas['格局'] = {label: count for count, label in parts}
        if '坪數' in side_metas:
            side_metas['坪數'] = clean_number(side_metas['坪數'])
        if '權狀坪數' in side_metas:
            side_metas['權狀坪數'] = clean_number(side_metas['權狀坪數'])

        # due day
        due_day = self.css_first(response, '.explain .ft-rt', deep_text=True)
        due_day = due_day.replace('有效期:', '')

        # owner
        owner = {}
        owner['name'] = self.css_first(response,
                                       '.avatarRight i',
                                       deep_text=True)
        owner['comment'] = self.css_first(response,
                                          '.avatarRight div',
                                          deep_text=True)
        agent_info = self.css(response,
                              '.avatarRight .auatarSonBox p',
                              deep_text=True)
        make_agent_info = partial(split_string_to_dict, seperator=':')
        agent_info = list(map(make_agent_info, agent_info))
        owner['isAgent'] = len(agent_info) > 0
        owner['agent'] = agent_info

        phone_ext = self.css_first(response,
                                   '.phone-hide .num',
                                   deep_text=True,
                                   allow_empty=True)
        phone_url = response.css('.phone-hide .num img').xpath(
            '@src').extract_first()

        if phone_ext:
            # phone will be pure text when owner use 591 built-in phone number
            # TODO: check is the ext is identical for the same owner
            owner['id'] = phone_ext
        elif phone_url:
            # or it will be an img, the src would be identical for the same owner
            # url is sth like
            # statics.591.com.tw/tools/showPhone.php?info_data=%2BbRfNLlKoLNhHOKui2zb%2FBxYO6A&type=rLEFMu4XrrpgEw
            parsed_url = urlparse(phone_url)
            qs = parse_qs(parsed_url.query)
            if 'info_data' in qs and qs['info_data']:
                owner['id'] = qs['info_data'][0]
        else:
            # sth strange happened, such as it's already dealt
            # let's try if there's avatar
            avatar = response.css('.userInfo .avatar img').xpath(
                '@src').extract_first()
            if avatar and 'no-photo-new.png' not in avatar:
                owner['id'] = avatar
            else:
                # last try, search description to see if there's phone number
                phone = re.search(r'09[0-9]{8}', ' '.join(desp))
                if phone:
                    owner['id'] = phone.group()

        return {
            'house_id': response.meta['rental'].id,
            'n_views': self.css_first(response, '.pageView b', deep_text=True),
            'top_region': top_region,
            'sub_region': sub_region,
            'address': address,
            'title': title,
            'imgs': imgs,
            'top_metas': top_metas,
            'facilities': fa,
            'without_facilities': without_fa,
            'environment': env,
            'sublets': sublets,
            'neighbor': nei,
            'desp': desp,
            'price': price,
            'price_includes': price_includes,
            'is_deal': is_deal,
            'side_metas': side_metas,
            'due_day': due_day,
            'owner': owner
        }
예제 #12
0
    def get_shared_basic(self, detail_dict):
        """Derive the basic attributes of a house from an API detail dict.

        Always sets ``top_region``, ``sub_region``, ``rough_address``,
        ``deal_status`` and ``n_balcony``.  Depending on the entries of
        ``detail_dict['info']`` it may also set ``building_type``,
        ``property_type``, floor information (``floor``, ``total_floor``,
        ``is_rooftop``, ``dist_to_highest_floor``), ``floor_ping`` and one
        counter per entry of ``self.apt_features`` plus ``apt_feature_code``.
        """
        ret = {}

        # region: xx city / xx district / property type
        breadcrumb = list_to_dict(get(detail_dict, 'breadcrumb', default=[]),
                                  name_field='query',
                                  value_field='name')
        top_region = get(breadcrumb, 'region', default='__UNKNOWN__')
        sub_region = get(breadcrumb, 'section', default='__UNKNOWN__')

        ret['top_region'] = self.get_enum(enums.TopRegionType,
                                          detail_dict['house_id'], top_region)

        ret['sub_region'] = self.get_enum(
            enums.SubRegionType, detail_dict['house_id'],
            '{}{}'.format(top_region, sub_region))

        ret['rough_address'] = get(detail_dict, 'favData.address')

        # deal_status
        deal_day = get(detail_dict, 'dealTime', 0)
        if deal_day > 0:
            # Issue #15, update only deal_status in crawler
            # let `syncstateful` to update the rest
            ret['deal_status'] = enums.DealStatusType.DEAL
        else:
            # Issue #14, always update deal status since item may be reopened
            ret['deal_status'] = enums.DealStatusType.OPENED

        info_section = list_to_dict(get(detail_dict, 'info', default=[]))

        # building_type: apartment / elevator building / townhouse
        if '型態' in info_section:
            building_type = info_section['型態']
            if building_type in ('別墅', '透天厝'):
                ret['building_type'] = enums.BuildingType.透天
            elif building_type in ('住宅大樓', '電梯大樓'):
                ret['building_type'] = enums.BuildingType.電梯大樓
            else:
                ret['building_type'] = self.get_enum(enums.BuildingType,
                                                     detail_dict['house_id'],
                                                     building_type)

        # property type
        if '類型' in info_section:
            ret['property_type'] = self.get_enum(enums.PropertyType,
                                                 detail_dict['house_id'],
                                                 info_section['類型'])
        elif '格局' in info_section:
            # a layout without an explicit type is stored as a whole-floor home
            ret['property_type'] = enums.PropertyType.整層住家

        # is_rooftop, floor, total_floor
        # TODO: use title to detect rooftop
        if '樓層' in info_section:
            # floor_info = 1F/2F or 頂樓加蓋/2F (rooftop addition)
            # or 整棟/2F (whole building)
            floor_info = info_section['樓層'].split('/')
            floor = clean_number(floor_info[0])
            # mark whole building as floor 0
            ret['floor'] = 0
            ret['total_floor'] = clean_number(floor_info[1])
            ret['is_rooftop'] = False

            if floor_info[0] == '頂樓加蓋':
                ret['is_rooftop'] = True
                ret['floor'] = ret['total_floor'] + 1
            elif 'B' in floor_info[0] and floor:
                # basement
                ret['floor'] = -floor
            elif floor:
                ret['floor'] = floor

            ret['dist_to_highest_floor'] = ret['total_floor'] - ret['floor']

        if '坪數' in info_section:
            ret['floor_ping'] = clean_number(info_section['坪數'])

        facility_keys = list_to_dict(
            # default=[] keeps this lookup in line with the other
            # list_to_dict calls when the section is absent
            get(detail_dict, 'service.facility', default=[]),
            name_field='key',
            # For balcony (陽台) only,
            # When no balcony, name is '陽台'
            # When there's balcony, name is 'x陽台'...
            value_field='name')
        n_balcony = clean_number(get(facility_keys, 'balcony', default=''))
        ret['n_balcony'] = n_balcony or 0

        if '格局' in info_section:
            # `\d+` so that counts of 10 or more keep all their digits
            apt_parts = re.findall(r'(\d+)([^\d]+)', info_section['格局'])
            apt_feature = {label: count for count, label in apt_parts}

            for name in self.apt_features:
                if self.apt_features[name] in apt_feature:
                    ret[name] = clean_number(
                        apt_feature[self.apt_features[name]])
                else:
                    ret[name] = 0

            ret['apt_feature_code'] = '{:02d}{:02d}{:02d}{:02d}'.format(
                ret['n_balcony'], ret['n_bath_room'], ret['n_bed_room'],
                ret['n_living_room'])

        return ret