def parse_price(number_string: str):
    """Split a list-page price string into its price fields.

    Issue #87: social housing reports its monthly price as a range written
    with ``~`` instead of a single number.

    Returns a dict that always contains ``monthly_price`` (parsed from the
    text before the first ``~``) and, only when a ``~`` is present,
    ``min_monthly_price`` (parsed from the text right after the first ``~``).
    Parsing is delegated to ``clean_number``.
    """
    first, *rest = number_string.split('~')
    parsed = {'monthly_price': clean_number(first)}
    if rest:
        parsed['min_monthly_price'] = clean_number(rest[0])
    return parsed
def get_target_value(self, house, target_config):
    """Resolve one configured value from a house record.

    ``target_config['path']`` is a sequence: its first element names an
    attribute of ``house`` and the remaining elements are dict keys that are
    followed in order.  The walk stops with ``None`` as soon as the current
    value is not a dict, or the next key is missing or maps to a falsy value.

    Post-processing, first match wins:

    * a truthy ``target_config['fn']`` is called with ``house`` and its
      result is returned, regardless of what the path produced;
    * a dict is serialized with ``json.dumps(..., ensure_ascii=False)``;
    * a truthy ``target_config['clean_number']`` sends the value through
      ``clean_number``; a ``ValueError`` is logged and becomes ``None``.

    Otherwise the value found by the path walk is returned unchanged.
    """
    path = target_config['path']
    cursor = getattr(house, path[0])
    for key in path[1:]:
        child = cursor.get(key) if isinstance(cursor, dict) else None
        if not child:
            cursor = None
            break
        cursor = child

    transform = target_config.get('fn')
    if transform:
        return transform(house)
    if isinstance(cursor, dict):
        return json.dumps(cursor, ensure_ascii=False)
    if not target_config.get('clean_number'):
        return cursor
    try:
        return clean_number(cursor)
    except ValueError:
        logging.error(f'Invalid number: {cursor}')
        return None
def gen_detail_shared_attrs(self, detail_dict):
    """Normalize a parsed detail page and flatten it into shared attributes.

    ``detail_dict`` is modified in place before the per-topic helpers run:

    * ``price`` is replaced by ``clean_number(price)``;
    * every ``price_includes`` entry loses the character '含';
    * entries of ``environment['生活機能']`` and ``environment['附近交通']``
      (when present) lose the character '近'; the latter also lose spaces.

    The result merges vendor/id/price/images with the dicts returned by
    ``get_shared_price``, ``get_shared_basic``, ``get_shared_environment``,
    ``get_shared_boolean_info`` and ``get_shared_misc``; on key collisions
    the later dict in that order wins.
    """
    detail_dict['price'] = clean_number(detail_dict['price'])
    detail_dict['price_includes'] = [
        item.replace('含', '') for item in detail_dict['price_includes']
    ]

    environment = detail_dict['environment']
    if '生活機能' in environment:
        environment['生活機能'] = [
            item.replace('近', '') for item in environment['生活機能']
        ]
    if '附近交通' in environment:
        environment['附近交通'] = [
            re.sub('[ ]', '', item.replace('近', ''))
            for item in environment['附近交通']
        ]

    # get_shared_price needs the output of get_shared_basic, so basic first.
    basic_info = self.get_shared_basic(detail_dict)
    price_info = self.get_shared_price(detail_dict, basic_info)
    env_info = self.get_shared_environment(detail_dict)
    boolean_info = self.get_shared_boolean_info(detail_dict)
    misc_info = self.get_shared_misc(detail_dict)

    shared = {
        'vendor': self.vendor,
        'vendor_house_id': detail_dict['house_id'],
        'monthly_price': detail_dict['price'],
        'imgs': detail_dict['imgs'],
    }
    for section in (price_info, basic_info, env_info, boolean_info,
                    misc_info):
        shared.update(section)
    return shared
def get_list_val(house, regular_attr, top_attr=None, to_number=False):
    """Read a field from a list-page house record.

    ``regular_attr`` is looked up first; ``top_attr`` is the fallback key
    used when the first one is absent.  Returns ``None`` when neither key is
    in ``house``.  With ``to_number`` set, a non-``None`` value is passed
    through ``clean_number`` before being returned.
    """
    for attr in (regular_attr, top_attr):
        if attr in house:
            value = house[attr]
            break
    else:
        return None

    if to_number and value is not None:
        return clean_number(value)
    return value
def parse_dealtime(row: House):
    """Estimate the deal date of a house from its list-page metadata.

    ``row.list_meta['addInfo']`` looks like '17天成交(1)' (dealt in 17 days)
    and ``row.list_meta['posttime']`` is '今日' (today), '昨日' (yesterday)
    or 'x天前' (x days ago).  The date is ``row.created_at`` moved back by
    the age given in ``posttime``, rendered in Taipei time.

    Returns:
        A 'YYYY-MM-DD' string, or ``None`` when ``addInfo`` is missing or
        carries no parsable day count, or when ``posttime`` carries no
        parsable day count.
    """
    base_time = row.created_at

    # example: 17天成交(1)
    add_info = row.list_meta.get('addInfo')
    if add_info is None:
        # No deal information at all; previously this crashed on `.split`.
        return None
    try:
        days_taken = clean_number(add_info.split('天')[0])
    except ValueError:
        return None
    if days_taken is None:
        return None

    # example: 今日、昨日、x天前
    last_updated = row.list_meta.get('posttime')
    if last_updated == '今日':
        pass
    elif last_updated == '昨日':
        base_time -= timedelta(days=1)
    else:
        # Apply the same tolerance used for `addInfo`: an unparsable age
        # means the date cannot be determined, not that the job should fail.
        try:
            days_ago = clean_number(last_updated)
        except ValueError:
            return None
        if days_ago is None:
            return None
        base_time -= timedelta(days=days_ago)

    return base_time.astimezone(taipei_time).strftime('%Y-%m-%d')
def count_and_parse_list(self, response):
    """Parse one list-API response, yielding follow-up requests and items.

    On the first page (``meta.page == 0``) the total record count is logged,
    ``self.N_PAGE`` is set to the number of items in this response, and one
    list request is yielded for every remaining page.

    For every house on the page (VIP ``topData`` entries are prepended
    unless ``self.novip`` is set) a ``HouseStats`` row is fetched or created
    for the current job; houses already recorded for this job are skipped.
    New houses yield a ``RawHouseItem``, a ``GenericHouseItem`` and a detail
    request, in that order.
    """
    meta = response.meta['rental']
    data = json.loads(response.text)

    if meta.page == 0:
        count = clean_number(data['records'])
        logging.info(f'[{meta.name}] total {count} house to crawl!')

        # The number of items returned per request may differ between API
        # endpoints, so the page size is measured from the first response.
        self.N_PAGE = len(data['data']['data'])

        # Generate all list requests now that the number of results is
        # known.  Skip when the first page is empty or no count was parsed:
        # with a page size of 0 the loop condition would never become false.
        if self.N_PAGE > 0 and count:
            cur_page = 1
            while cur_page * self.N_PAGE < count:
                yield self.gen_list_request(
                    util.ListRequestMeta(meta.id, meta.name, cur_page))
                cur_page += 1

    houses = data['data']['data']
    if not self.novip:
        houses = data['data']['topData'] + houses

    for house in houses:
        # copy from twrh
        house['is_vip'] = 'id' not in house
        house_item = self.gen_shared_attrs(house, meta)
        house_id = house_item['vendor_house_id']

        # Only the `created` flag matters: it tells whether this job has
        # already seen the house.
        _, created = HouseStats.get_or_create(
            job_id=self.job.id, house_id=house_id)
        if not created:
            continue

        yield RawHouseItem(house_id=house_id,
                           vendor=self.vendor,
                           is_list=True,
                           raw=json.dumps(house, ensure_ascii=False))
        yield GenericHouseItem(**house_item)
        yield self.gen_detail_request(
            util.DetailRequestMeta(house_id, False))
def default_parse_list(self, response):
    """Parse one list-API response, yielding follow-up requests and items.

    The total record count is parsed from every response.  On the first
    page (``meta.page == 0``) one list request is yielded for each remaining
    page, using ``self.N_PAGE`` as the page size.  Then, for every house in
    ``topData`` followed by ``data``, a ``RawHouseItem``, a
    ``GenericHouseItem`` and a detail request are yielded, in that order.
    """
    payload = json.loads(response.text)
    count = clean_number(payload['records'])
    meta = response.meta['rental']

    if meta.page == 0:
        # The first response reveals the total, so all remaining list pages
        # can be scheduled immediately.
        next_page = 1
        while next_page * self.N_PAGE < count:
            yield self.gen_list_request(
                ListRequestMeta(meta.id, meta.name, next_page))
            next_page += 1

    listing = payload['data']
    for house in listing['topData'] + listing['data']:
        house_item = self.gen_shared_attrs(house, meta)
        yield RawHouseItem(
            house_id=house_item['vendor_house_id'],
            vendor=self.vendor,
            is_list=True,
            raw=json.dumps(house, ensure_ascii=False),
        )
        yield GenericHouseItem(**house_item)
        yield self.gen_detail_request(
            DetailRequestMeta(house_item['vendor_house_id']))
def gen_shared_attrs(self, house, meta: ListRequestMeta):
    """Convert one list-API house record into generic house attributes.

    Args:
        house: dict for one house taken from the list API response.
        meta: request metadata; ``meta.name`` is used as the region name
            when the record itself has no ``region_name``.

    Returns:
        A dict of generic attributes (vendor, id, url, images, regions,
        property type, size, floor information and the price fields from
        ``parse_price``).  Keys whose value is ``None`` are left out.
    """
    house_id = get_list_val(house, 'id', 'post_id')
    url = "{}/v1/house/rent/detail?id={}".format(API_URL, house_id)

    # topData doesn't contain region_name for some reason..
    region_name = house['region_name'] if 'region_name' in house else meta.name
    top_region = self.get_enum(TopRegionType, house_id, region_name)

    sub_region = self.get_enum(
        SubRegionType, house_id,
        '{}{}'.format(
            TopRegionType(top_region).name,
            get_list_val(house, 'section_name', 'section_str')))

    property_type = None
    if 'kind_name' in house:
        # The lookup result used to be thrown away, which left
        # property_type permanently None.
        property_type = self.get_enum(
            PropertyType, house_id, get_list_val(house, 'kind_name'))

    floor = None
    total_floor = None
    if 'floor_str' in house:
        floor_info = house['floor_str'].split('/')
        if len(floor_info) >= 2:
            floor = clean_number(floor_info[0])
            total_floor = clean_number(floor_info[1])
            # Rooftop add-on: must be detected on the raw text, because
            # `floor` has already been converted by clean_number and can
            # never equal the label.
            if floor_info[0] == '頂樓加蓋' and total_floor is not None:
                floor = total_floor + 1
            elif 'B' in floor_info[0] and floor:
                # basement
                floor = -floor
            elif floor is None:
                # whole building
                floor = 0

    # 99 and 100 are magic number in 591...
    # https://github.com/g0v/tw-rental-house-data/issues/11
    if floor == 99:
        floor = 0
    elif floor == 100 and total_floor:
        floor = total_floor + 1

    price_range = parse_price(get_list_val(house, 'price'))

    generic_house = {
        'vendor': self.vendor,
        'vendor_house_id': house_id,
        'vendor_house_url': url,
        'imgs': get_list_val(house, 'photo_list'),
        'top_region': top_region,
        'sub_region': sub_region,
        'property_type': property_type,
        'floor_ping': clean_number(house['area']),
        'floor': floor,
        'total_floor': total_floor,
        **price_range
    }

    # Unknown values are omitted rather than emitted as None.
    return {
        key: value
        for key, value in generic_house.items()
        if value is not None
    }
def get_shared_basic(self, detail_dict):
    """Derive the basic shared attributes from a parsed detail page.

    Reads ``top_region``/``sub_region``/``address``/``is_deal`` and the
    ``side_metas`` mapping of ``detail_dict`` and returns a dict that may
    contain: ``top_region``, ``sub_region``, ``rough_address``,
    ``deal_status``, ``building_type``, ``property_type``, ``floor``,
    ``total_floor``, ``is_rooftop``, ``dist_to_highest_floor``,
    ``floor_ping``, one count per entry of ``self.apt_features`` and
    ``apt_feature_code``.  Only ``deal_status`` is always present.
    """
    ret = {}

    # top_region, sub_region
    if 'top_region' in detail_dict:
        ret['top_region'] = self.get_enum(enums.TopRegionType,
                                          detail_dict['house_id'],
                                          detail_dict['top_region'])
        ret['sub_region'] = self.get_enum(
            enums.SubRegionType, detail_dict['house_id'],
            '{}{}'.format(detail_dict['top_region'],
                          detail_dict['sub_region']))

    if 'address' in detail_dict:
        ret['rough_address'] = detail_dict['address']

    # deal_status
    if detail_dict['is_deal']:
        # Issue #15, update only deal_status in crawler
        # let `syncstateful` to update the rest
        ret['deal_status'] = enums.DealStatusType.DEAL
    else:
        # Issue #14, always update deal status since item may be reopened
        ret['deal_status'] = enums.DealStatusType.OPENED

    side_metas = detail_dict['side_metas']

    # building_type: apartment / elevator building / townhouse
    if '型態' in side_metas:
        building_type = side_metas['型態']
        if building_type in ('別墅', '透天厝'):
            ret['building_type'] = enums.BuildingType.透天
        elif building_type == '住宅大樓':
            ret['building_type'] = enums.BuildingType.電梯大樓
        else:
            ret['building_type'] = self.get_enum(enums.BuildingType,
                                                 detail_dict['house_id'],
                                                 building_type)

    # property type
    if '現況' in side_metas:
        ret['property_type'] = self.get_enum(enums.PropertyType,
                                             detail_dict['house_id'],
                                             side_metas['現況'])

    # is_rooftop, floor, total_floor
    # TODO: use title to detect rooftop
    if '樓層' in side_metas:
        # floor_info = 1F/2F or 頂樓加蓋/2F or 整棟/2F
        floor_info = side_metas['樓層'].split('/')
        floor = clean_number(floor_info[0])
        # Tolerate a value without '/' or with an unparsable total instead
        # of raising; the total-dependent fields are then simply left out.
        total_floor = None
        if len(floor_info) >= 2:
            total_floor = clean_number(floor_info[1])

        # A whole building (no floor number) is recorded as floor 0.
        ret['floor'] = 0
        ret['is_rooftop'] = floor_info[0] == '頂樓加蓋'
        if ret['is_rooftop']:
            if total_floor is not None:
                ret['floor'] = total_floor + 1
        elif 'B' in floor_info[0] and floor:
            # basement
            ret['floor'] = -floor
        elif floor:
            ret['floor'] = floor

        if total_floor is not None:
            ret['total_floor'] = total_floor
            ret['dist_to_highest_floor'] = total_floor - ret['floor']

    if '坪數' in side_metas:
        ret['floor_ping'] = clean_number(side_metas['坪數'])

    if '格局' in side_metas:
        # side_metas['格局'] maps a room label to its count.
        apt_feature = side_metas['格局']
        for name, label in self.apt_features.items():
            if label in apt_feature:
                ret[name] = clean_number(apt_feature[label])
            else:
                ret[name] = 0

        ret['apt_feature_code'] = '{:02d}{:02d}{:02d}{:02d}'.format(
            ret['n_balcony'], ret['n_bath_room'], ret['n_bed_room'],
            ret['n_living_room'])

    # TODO: rough_address
    return ret
def get_shared_price(self, detail_dict, basic_info):
    """Derive deposit, management-fee, parking and per-ping price fields.

    Args:
        detail_dict: parsed detail page; reads ``top_metas``,
            ``price_includes`` and the numeric ``price``.
        basic_info: result of ``get_shared_basic``; ``floor_ping`` is used
            for the per-ping price.

    Returns:
        A dict that may contain ``deposit_type``, ``n_month_deposit``,
        ``deposit``, ``is_require_management_fee``,
        ``monthly_management_fee``, ``has_parking``,
        ``is_require_parking_fee``, ``monthly_parking_fee`` and
        ``per_ping_price``.  ``per_ping_price`` is omitted when the floor
        size is missing, ``None`` or zero, or when the price is ``None``.
    """
    ret = {}
    top_metas = detail_dict['top_metas']

    # deposit_type, n_month_deposit
    if '押金' in top_metas:
        deposit = top_metas['押金']
        month_deposit = deposit.split('個月')
        if len(month_deposit) == 2:
            # Deposit expressed as a number of months of rent.
            ret['deposit_type'] = enums.DepositType.月
            n_month = self.from_zh_number(month_deposit[0])
            price = detail_dict['price']
            ret['n_month_deposit'] = n_month
            if n_month is None or price is None:
                ret['deposit'] = None
            else:
                ret['deposit'] = n_month * price
        elif deposit.replace(',', '').isdigit():
            # Deposit expressed as a fixed amount.
            amount = clean_number(deposit)
            price = detail_dict['price']
            ret['deposit'] = amount
            ret['deposit_type'] = enums.DepositType.定額
            # A zero or unknown rent cannot be turned into a month count.
            if amount is not None and price:
                ret['n_month_deposit'] = amount / price
            else:
                ret['n_month_deposit'] = None
        elif deposit == '面議':
            # Negotiable.
            ret['deposit_type'] = enums.DepositType.面議
            ret['n_month_deposit'] = None
            ret['deposit'] = None
        else:
            ret['deposit_type'] = enums.DepositType.其他
            ret['n_month_deposit'] = None
            ret['deposit'] = None

    # is_require_management_fee, monthly_management_fee
    if '管理費' in detail_dict['price_includes']:
        ret['is_require_management_fee'] = False
        ret['monthly_management_fee'] = 0
    elif '管理費' in top_metas:
        mgmt_fee = top_metas['管理費']
        # could be xxx元/月, --, -, !@$#$%...
        if '元/月' in mgmt_fee:
            ret['is_require_management_fee'] = True
            ret['monthly_management_fee'] = clean_number(mgmt_fee)
        else:
            ret['is_require_management_fee'] = False
            ret['monthly_management_fee'] = 0

    # *_parking*
    if '車 位' in top_metas:
        parking_str = top_metas['車 位']
        parking_fee = clean_number(parking_str)
        ret['has_parking'] = True

        if parking_fee:
            ret['is_require_parking_fee'] = True
            ret['monthly_parking_fee'] = parking_fee
        elif '已含' in parking_str:
            ret['is_require_parking_fee'] = False
            ret['monthly_parking_fee'] = 0
        elif '費用另計' in parking_str:
            ret['is_require_parking_fee'] = True
            ret['monthly_parking_fee'] = 0
        elif parking_str == '無':
            ret['has_parking'] = False

    # per ping price
    floor_ping = basic_info['floor_ping'] if 'floor_ping' in basic_info else None
    if floor_ping:
        price = detail_dict['price']
        if price is not None:
            # Fees may be absent or None; both count as 0 here.
            mgmt = ret.get('monthly_management_fee') or 0
            parking = ret.get('monthly_parking_fee') or 0
            ret['per_ping_price'] = (price + mgmt + parking) / floor_ping

    return ret
def collect_dict(self, response):
    """Scrape a detail page into a plain dict of raw-ish fields.

    Uses the ``self.css`` / ``self.css_first`` selector helpers plus
    ``response.css`` directly.  Numbers in the sublet table and in the
    ``坪數`` / ``權狀坪數`` side metas are converted with ``clean_number``;
    the ``格局`` (layout) side meta becomes a ``{label: count_text}`` dict.

    The owner id is resolved in this order and ``owner['id']`` is only set
    when one of the sources yields a value:

    1. the plain-text phone extension,
    2. the ``info_data`` query parameter of the phone image URL,
    3. the avatar URL, unless it is the 'no-photo-new.png' placeholder,
    4. the first mobile-looking number (``09`` + 8 digits) in the
       description.

    Returns:
        dict with keys ``house_id``, ``n_views``, ``top_region``,
        ``sub_region``, ``address``, ``title``, ``imgs``, ``top_metas``,
        ``facilities``, ``without_facilities``, ``environment``,
        ``sublets``, ``neighbor``, ``desp``, ``price``, ``price_includes``,
        ``is_deal``, ``side_metas``, ``due_day`` and ``owner``.
    """
    # title
    title = self.css_first(response, '.houseInfoTitle', deep_text=True)

    # region: home / rental / xx city / xx district
    breadcrumb = self.css(response, '#propNav a', deep_text=True)
    if len(breadcrumb) >= 4:
        if breadcrumb[2] == '出租' and len(breadcrumb) >= 5:
            # 首頁 > 店面 > 出租 > 台北市 > 大安區 > 台北市大安區安和路二段
            top_region = breadcrumb[3]
            sub_region = breadcrumb[4]
        else:
            # 首頁 > 租屋 > 台北市 > 大安區 > 獨立套房 > 20000-30000元 >
            # 台北市大安區仁愛路四段50號
            top_region = breadcrumb[2]
            sub_region = breadcrumb[3]
    else:
        top_region = '__UNKNOWN__'
        sub_region = '__UNKNOWN__'

    # rough address
    address = self.css_first(response, '#propNav .addr', deep_text=True)

    # image, it's in a hidden input
    imgs = self.css_first(response,
                          '#hid_imgArr::attr(value)',
                          allow_empty=True).replace('"', '').split(',')
    if imgs[0] == "":
        imgs.pop(0)

    # top meta, including deposit, legal usage, etc..
    top_meta_keys = self.css(response, '.labelList-1 .one', deep_text=True)
    top_meta_values = self.css(response,
                               '.labelList-1 .two em',
                               deep_text=True)
    top_metas = dict_from_tuple(top_meta_keys, top_meta_values)
    if '身份要求' in top_metas:
        top_metas['身份要求'] = top_metas['身份要求'].split('、')

    # facilities, including wardrobe, sofa, etc..
    fa_status = self.css(response, '.facility li span::attr(class)')
    fa_text = self.css(response, '.facility li', deep_text=True)
    fa = []
    without_fa = []
    for index, key in enumerate(fa_text):
        # A status class of 'no' marks a facility the house does not have.
        if fa_status[index] != 'no':
            fa.append(key)
        else:
            without_fa.append(key)

    # environment
    # <p><strong>生活機能</strong>:近便利商店;傳統市場;夜市</p>
    env_keys = self.css(response, '.lifeBox > p strong', deep_text=True)
    env_desps = self.css(response, '.lifeBox > p', deep_text=True)
    env_desps = [re.sub('.*:', '', desp).split(';') for desp in env_desps]
    env = dict_from_tuple(env_keys, env_desps)

    # neighbor
    nei_selector = response.css('.lifeBox.community')
    nei = {}
    if nei_selector:
        nei['name'] = self.css_first(nei_selector,
                                     '.communityName a',
                                     deep_text=True)
        nei['desp'] = self.css_first(nei_selector,
                                     '.communityIntroduce::text',
                                     deep_text=True,
                                     allow_empty=True)
        nei['url'] = SITE_URL + self.css_first(
            nei_selector,
            '.communityIntroduce a::attr(href)',
            allow_empty=True)
        nei_keys = self.css(nei_selector, '.communityDetail p::text')
        nei_values = self.css(nei_selector,
                              '.communityDetail p > *',
                              deep_text=True)
        nei['info'] = dict_from_tuple(nei_keys, nei_values)

    # sublets: shared suites / rooms listed in a table
    sublets_keys = self.css(response, '.list-title span', deep_text=True)
    sublets_list = response.css('.house-list')
    sublets = []
    for sublet in sublets_list:
        texts = self.css(sublet, 'li', deep_text=True)
        sublet_dict = dict_from_tuple(sublets_keys, texts)
        if '租金' in sublet_dict:
            sublet_dict['租金'] = clean_number(sublet_dict['租金'])
        if '坪數' in sublet_dict:
            sublet_dict['坪數'] = clean_number(sublet_dict['坪數'])
        sublets.append(sublet_dict)

    # desp
    desp = self.css(response, '.houseIntro *', deep_text=True)

    # q and a
    # TODO
    # TODO: format correct

    # price
    # <div class="price clearfix"><i>14,500 <b>元/月</b></i></div>
    price = self.css_first(response, '.price i', deep_text=True)

    # built-in facility
    price_includes = self.css_first(response,
                                    '.detailInfo .price+.explain',
                                    deep_text=True,
                                    allow_empty=True).split('/')

    # lease status
    is_deal = len(response.css('.filled').extract()) > 0

    # side meta: "key:value" list items; text after the first ':' is kept
    # whole, and items without ':' are ignored.
    sides = self.css(response, '.detailInfo .attr li', deep_text=True)
    side_metas = {}
    for side in sides:
        key, separator, value = side.partition(':')
        if separator:
            side_metas[key] = value

    # layout, e.g. 格局 : 3房2廳2衛2陽台
    if '格局' in side_metas:
        # TODO: open-plan layout
        # `\d+` so that counts of 10 or more are captured whole; a single
        # `\d` would read "10房" as 0.
        parts = re.findall(r'(\d+)([^\d]+)', side_metas['格局'])
        side_metas['格局'] = {label: count for count, label in parts}
    if '坪數' in side_metas:
        side_metas['坪數'] = clean_number(side_metas['坪數'])
    if '權狀坪數' in side_metas:
        side_metas['權狀坪數'] = clean_number(side_metas['權狀坪數'])

    # due day
    due_day = self.css_first(response, '.explain .ft-rt', deep_text=True)
    due_day = due_day.replace('有效期:', '')

    # owner
    owner = {}
    owner['name'] = self.css_first(response,
                                   '.avatarRight i',
                                   deep_text=True)
    owner['comment'] = self.css_first(response,
                                      '.avatarRight div',
                                      deep_text=True)
    agent_info = self.css(response,
                          '.avatarRight .auatarSonBox p',
                          deep_text=True)
    make_agent_info = partial(split_string_to_dict, seperator=':')
    agent_info = list(map(make_agent_info, agent_info))
    owner['isAgent'] = len(agent_info) > 0
    owner['agent'] = agent_info

    phone_ext = self.css_first(response,
                               '.phone-hide .num',
                               deep_text=True,
                               allow_empty=True)
    phone_url = response.css('.phone-hide .num img').xpath(
        '@src').extract_first()

    owner_id = None
    if phone_ext:
        # phone will be pure text when owner use 591 built-in phone number
        # TODO: check is the ext is identical for the same owner
        owner_id = phone_ext
    elif phone_url:
        # or it will be an img, the src would be identical for the same
        # owner.  url is sth like
        # statics.591.com.tw/tools/showPhone.php?info_data=%2BbRfNLlKoLNhHOKui2zb%2FBxYO6A&type=rLEFMu4XrrpgEw
        qs = parse_qs(urlparse(phone_url).query)
        if qs.get('info_data'):
            owner_id = qs['info_data'][0]

    if owner_id is None:
        # sth strange happened, such as it's already dealt
        # let's try if there's avatar
        avatar = response.css('.userInfo .avatar img').xpath(
            '@src').extract_first()
        if avatar and 'no-photo-new.png' not in avatar:
            owner_id = avatar
        else:
            # last try, search description to see if there's phone number
            phone = re.search(r'09[0-9]{8}', ' '.join(desp))
            if phone:
                owner_id = phone.group()

    if owner_id is not None:
        owner['id'] = owner_id

    return {
        'house_id': response.meta['rental'].id,
        'n_views': self.css_first(response, '.pageView b', deep_text=True),
        'top_region': top_region,
        'sub_region': sub_region,
        'address': address,
        'title': title,
        'imgs': imgs,
        'top_metas': top_metas,
        'facilities': fa,
        'without_facilities': without_fa,
        'environment': env,
        'sublets': sublets,
        'neighbor': nei,
        'desp': desp,
        'price': price,
        'price_includes': price_includes,
        'is_deal': is_deal,
        'side_metas': side_metas,
        'due_day': due_day,
        'owner': owner
    }
def get_shared_basic(self, detail_dict):
    """Derive the basic shared attributes from a detail-API payload.

    Reads ``breadcrumb``, ``favData.address``, ``dealTime``, ``info`` and
    ``service.facility`` from ``detail_dict`` (via the ``get`` and
    ``list_to_dict`` helpers) and returns a dict with ``top_region``,
    ``sub_region``, ``rough_address``, ``deal_status`` and ``n_balcony``,
    plus, when the corresponding info entries exist: ``building_type``,
    ``property_type``, ``floor``, ``total_floor``, ``is_rooftop``,
    ``dist_to_highest_floor``, ``floor_ping``, one count per entry of
    ``self.apt_features`` and ``apt_feature_code``.
    """
    ret = {}

    # region: xx city / xx district / property kind
    breadcrumb = list_to_dict(get(detail_dict, 'breadcrumb', default=[]),
                              name_field='query',
                              value_field='name')
    top_region = get(breadcrumb, 'region', default='__UNKNOWN__')
    sub_region = get(breadcrumb, 'section', default='__UNKNOWN__')

    ret['top_region'] = self.get_enum(enums.TopRegionType,
                                      detail_dict['house_id'], top_region)
    ret['sub_region'] = self.get_enum(
        enums.SubRegionType, detail_dict['house_id'],
        '{}{}'.format(top_region, sub_region))

    ret['rough_address'] = get(detail_dict, 'favData.address')

    # deal_status
    # `or 0` so that an explicit null dealTime is treated as "not dealt"
    # instead of failing the comparison below.
    deal_day = get(detail_dict, 'dealTime', 0) or 0
    if deal_day > 0:
        # Issue #15, update only deal_status in crawler
        # let `syncstateful` to update the rest
        ret['deal_status'] = enums.DealStatusType.DEAL
    else:
        # Issue #14, always update deal status since item may be reopened
        ret['deal_status'] = enums.DealStatusType.OPENED

    info_section = list_to_dict(get(detail_dict, 'info', default=[]))

    # building_type: apartment / elevator building / townhouse
    if '型態' in info_section:
        building_type = info_section['型態']
        if building_type in ('別墅', '透天厝'):
            ret['building_type'] = enums.BuildingType.透天
        elif building_type in ('住宅大樓', '電梯大樓'):
            ret['building_type'] = enums.BuildingType.電梯大樓
        else:
            ret['building_type'] = self.get_enum(enums.BuildingType,
                                                 detail_dict['house_id'],
                                                 building_type)

    # property type
    if '類型' in info_section:
        ret['property_type'] = self.get_enum(enums.PropertyType,
                                             detail_dict['house_id'],
                                             info_section['類型'])
    elif '格局' in info_section:
        ret['property_type'] = enums.PropertyType.整層住家

    # is_rooftop, floor, total_floor
    # TODO: use title to detect rooftop
    if '樓層' in info_section:
        # floor_info = 1F/2F or 頂樓加蓋/2F or 整棟/2F
        floor_info = info_section['樓層'].split('/')
        floor = clean_number(floor_info[0])
        # Tolerate a value without '/' or with an unparsable total instead
        # of raising; the total-dependent fields are then simply left out.
        total_floor = None
        if len(floor_info) >= 2:
            total_floor = clean_number(floor_info[1])

        # mark a whole building (整棟) as floor 0
        ret['floor'] = 0
        ret['is_rooftop'] = floor_info[0] == '頂樓加蓋'
        if ret['is_rooftop']:
            if total_floor is not None:
                ret['floor'] = total_floor + 1
        elif 'B' in floor_info[0] and floor:
            # basement
            ret['floor'] = -floor
        elif floor:
            ret['floor'] = floor

        if total_floor is not None:
            ret['total_floor'] = total_floor
            ret['dist_to_highest_floor'] = total_floor - ret['floor']

    if '坪數' in info_section:
        ret['floor_ping'] = clean_number(info_section['坪數'])

    # For the balcony entry only: when there is no balcony the name is
    # '陽台', when there is one the name is 'x陽台'...
    facility_keys = list_to_dict(get(detail_dict, 'service.facility'),
                                 name_field='key',
                                 value_field='name')
    n_balcony = clean_number(get(facility_keys, 'balcony', default=''))
    ret['n_balcony'] = n_balcony or 0

    if '格局' in info_section:
        # `\d+` so that counts of 10 or more are captured whole; a single
        # `\d` would read "10房" as 0.
        apt_parts = re.findall(r'(\d+)([^\d]+)', info_section['格局'])
        apt_feature = {label: count for count, label in apt_parts}

        for name, label in self.apt_features.items():
            if label in apt_feature:
                ret[name] = clean_number(apt_feature[label])
            else:
                ret[name] = 0

        ret['apt_feature_code'] = '{:02d}{:02d}{:02d}{:02d}'.format(
            ret['n_balcony'], ret['n_bath_room'], ret['n_bed_room'],
            ret['n_living_room'])

    return ret