class Hse28Scrapper(Hse28Setting):
    """Scrape 28hse.com search results and notify about unseen listings.

    The workflow (see ``hse28_scrapping``) loads one search-result page per
    configured district, keeps the property links whose numeric id is
    greater than the last stored maximum id, downloads those property
    pages, filters them by size / age / price text and sends every
    remaining record through the notifier.

    Configuration values (tags, URLs, thresholds, file names) are read from
    attributes expected on the settings base class.
    """

    def __init__(self):
        self.scrapper = WebScrapper()
        self.io = FileIO()
        self.notifier = TelegramNotifier()

    def _search_result_checker(self, html):
        """Return whether ``html`` contains the search-result element."""
        return check_html_element_exist(html, self.RESULT_TAG)

    def _property_page_checker(self, html):
        """Return whether ``html`` contains the property-detail element."""
        return check_html_element_exist(html, self.PROPERTY_TAB)

    def get_last_max_id(self):
        """Load the stored ``max_id`` value and return it as an ``int``."""
        temp_id = self.io.load_file(DATA_PATH, self.TEMP_ID_FILE, 'json')
        return int(temp_id['max_id'])

    async def sort_filter_page(self, session):
        """Apply the price range and the sort option to a browser session.

        Passed as ``extra_action`` when the result pages are browsed.
        Returns the session produced by the last helper call.
        """
        session = await async_fill_box(session, self.LOW_PRICE_TAG,
                                       self.LOWEST_PRICE)
        session = await async_fill_box(session, self.HIGH_PRICE_TAG,
                                       self.HIGHEST_PRICE)
        session = await async_click_button(session, self.PRICE_BUTTON_TAG,
                                           self.LOAD_TAG)
        session = await async_click_option(session, self.SORT_TAG,
                                           self.LOAD_TAG)
        return session

    def get_search_url_list(self):
        """Return one search URL per entry of ``self.DISTRICT``."""
        return [self.RESULT_URL.format(district) for district in self.DISTRICT]

    def get_html_result_pages(self, url_list):
        """Browse every search URL and return the scrapper's responses."""
        return self.scrapper.browse_multiple_html(
            url_list,
            extra_action=self.sort_filter_page,
            html_checker=self._search_result_checker,
            asyn=True)

    @staticmethod
    def _get_property_id_from_url(url):
        """Extract the numeric property id from a property URL.

        Returns ``-1`` when ``url`` is not a string (e.g. an ``<a>`` tag
        without ``href``), is not a property URL, or carries a non-numeric
        id, so callers can filter such links out instead of crashing.
        """
        prefix = 'https://www.28hse.com/buy-property-'
        if not isinstance(url, str) or prefix not in url:
            return -1
        try:
            return int(url.replace(prefix, '').replace('.html', ''))
        except ValueError:
            return -1

    def get_property_url_from_html(self, response, max_id):
        """Return the property URLs on a result page with id > ``max_id``.

        A failed response is reported through the notifier and yields an
        empty list.
        """
        if not response['ok']:
            self.notifier.send_message(
                'Unable to load result for page {}. {}'.format(
                    response['url'], response['error']))
            return []

        soup = BeautifulSoup(response['message'], 'html.parser')
        result_list = soup.select(self.RESULT_TAG)[0]
        hrefs = [tag.get('href') for tag in result_list.select('a')]
        # Non-property links map to -1 and therefore never exceed max_id.
        return [
            href for href in hrefs
            if self._get_property_id_from_url(href) > max_id
        ]

    def aggregate_property_url(self, response_list, max_id):
        """Collect the new property URLs of all result pages in one list."""
        return [
            url
            for response in response_list
            for url in self.get_property_url_from_html(response, max_id)
        ]

    def get_property_pages(self, property_url_list):
        """Load every property page and return the scrapper's responses."""
        return self.scrapper.load_multiple_html(
            property_url_list, html_checker=self._property_page_checker)

    def extract_property_pages(self, response):
        """Turn one property-page response into a one-record DataFrame.

        The detail table is read with ``pd.read_html``, its first column
        becomes the header and the page URL is added as column ``URL``.
        A failed response is reported through the notifier and yields an
        empty DataFrame.
        """
        if not response['ok']:
            self.notifier.send_message(
                'Unable to load property for page {}. {}'.format(
                    response['url'], response['error']))
            return pd.DataFrame([])

        soup = BeautifulSoup(response['message'], 'html.parser')
        property_table = pd.read_html(
            str(soup.select(self.PROPERTY_TAB)[0]))[0]
        property_table = property_table.set_index(0).T
        return property_table.assign(**{'URL': response['url']})

    def aggregate_property_table(self, response_list):
        """Concatenate all property records, indexed by the 28HSE id.

        Empty frames (failed pages) are skipped; when nothing usable is
        left an empty DataFrame is returned instead of failing on the
        missing index column.
        """
        tables = [
            self.extract_property_pages(response)
            for response in response_list
        ]
        tables = [table for table in tables if len(table) > 0]
        if not tables:
            return pd.DataFrame([])
        property_data = pd.concat(tables, sort=False)
        return property_data.set_index('28HSE 樓盤編號')

    def clean_filter_property_table(self, property_data):
        """Tidy the text columns and filter rows by size, age and price text.

        Rows with a missing size or age are kept. Rows whose price text
        contains '居屋' are removed; a missing price does not match.
        """
        property_data = property_data.assign(
            **{
                '售價':
                property_data.loc[:, '售價'].str.split('每月供款').str[0],
                '物業地址':
                property_data.loc[:, '物業地址'].str.replace('屋苑位置', '')
            })

        # regex=True is explicit: since pandas 2.0 the default is a literal
        # match, which would leave the unit text in place and break the
        # float conversion.
        size = property_data.loc[:, '實用面積(呎)']
        size_value = size.str.replace('[^0-9]', '', regex=True).astype(float)
        property_data = property_data.loc[
            (size_value >= self.SMALLEST_SIZE) | size.isnull()]

        age = property_data.loc[:, '樓齡(年)']
        age_value = age.str.replace('[^0-9]', '', regex=True).astype(float)
        property_data = property_data.loc[
            (age_value <= self.OLDEST_PROPERTY) | age.isnull()]

        # na=False keeps the mask boolean when a price is missing; without
        # it the NaN entries make the ``~`` negation fail.
        excluded = property_data.loc[:, '售價'].str.contains('居屋', na=False)
        property_data = property_data.loc[~excluded]

        property_data = property_data.drop(
            ['物業編號', '樓盤狀態', '瀏覽人次', '收藏人次', '刊登或續期日', '記錄更新',
             '放盤到期日'],
            axis=1)
        return property_data.fillna('')

    def save_max_id(self, property_data):
        """Persist the largest index value of ``property_data`` as max id."""
        max_id = str(property_data.index.astype(int).max())
        # An empty index yields NaN; never overwrite the stored id with it.
        if max_id != 'nan':
            self.io.save_file({'max_id': max_id}, DATA_PATH,
                              self.TEMP_ID_FILE, 'json')

    @staticmethod
    def convert_property_dict_to_str(property_dict):
        """Format a record as one ``key: value`` line per field."""
        return ''.join(
            '{}: {}\n'.format(key, value)
            for key, value in property_dict.items())

    def send_property_details(self, property_data):
        """Send every record of ``property_data`` to every chat in CHAT_ID."""
        messages = [
            self.convert_property_dict_to_str(record)
            for record in property_data.to_dict('records')
        ]
        for chat_id in CHAT_ID:
            # The notifier reads its target chat from this attribute.
            self.notifier.CHAT_ID = chat_id
            for message in messages:
                self.notifier.send_message(message)

    def hse28_scrapping(self):
        """Run the full scrape, filter, store-max-id and notify cycle."""
        max_id = self.get_last_max_id()
        url_list = self.get_search_url_list()
        response_list = self.get_html_result_pages(url_list)
        property_url_list = self.aggregate_property_url(response_list, max_id)
        response_list = self.get_property_pages(property_url_list)
        property_data = self.aggregate_property_table(response_list)
        if len(property_data) > 0:
            property_data = self.clean_filter_property_table(property_data)
        if len(property_data) > 0:
            self.save_max_id(property_data)
            self.send_property_details(property_data)
class Modeller(ModellerSetting):
    """Helpers for building, fitting, scoring and tuning estimators.

    Scalers, estimators, scorers, cross-validators and searchers are looked
    up by name in dictionaries expected on the settings base class
    (``SCALER_DICT``, ``ESTIMATOR_DICT``, ``SCORER_DICT``, ``CV_DICT``,
    ``SEARCHER_DICT``).
    """

    def __init__(self):
        self.file_io = FileIO()
        self.visual = Visualization()
        self.logger = set_logger(LOGGER_PATH, self.LOGGER_FILE,
                                 self.LOGGER_LEVEL, __name__)
        ktf.set_session(get_session())

    def save_estimator(self, estimator, file_name):
        """Store ``estimator`` under ``MODEL_PATH`` in joblib format."""
        self.file_io.save_file(estimator, self.MODEL_PATH, file_name, 'joblib')

    def load_estimator(self, file_name):
        """Load a joblib object from ``MODEL_PATH``."""
        return self.file_io.load_file(self.MODEL_PATH, file_name, 'joblib')

    @staticmethod
    def notify_model_fitting_failure(message):
        """Send a model-fitting error message through the notifier agent."""
        send_message('Error in fitting model. {}'.format(message),
                     NOTIFIER_AGENT)

    def try_save_notify_exit(self,
                             func,
                             estimator,
                             x_data,
                             y_data,
                             *args,
                             model_file_name=None,
                             verb=False,
                             **kwargs):
        """Run ``func`` and optionally save its result; exit on any failure.

        ``func`` is called as ``func(estimator, x_data, y_data, *args,
        **kwargs)``. Its result is saved when ``model_file_name`` is given
        and a debug line is logged when ``verb`` is true. A
        ``KeyboardInterrupt`` exits silently; any other exception is
        reported through ``notify_model_fitting_failure`` before the
        process exits.
        """
        try:
            estimator = func(estimator, x_data, y_data, *args, **kwargs)
            if model_file_name is not None:
                self.save_estimator(estimator, model_file_name)
            if verb:
                self.logger.debug(
                    'Model {} fitting completed'.format(model_file_name))
            return estimator
        except KeyboardInterrupt:
            sys.exit()
        except Exception as e:
            self.notify_model_fitting_failure(
                '{}. model_file_name: {}'.format(e, model_file_name))
            sys.exit()

    def set_feature_scaler(self, name, **kwargs):
        """Instantiate the scaler registered as ``name``.

        Logs an error and returns ``None`` for an unknown name.
        """
        if name in self.SCALER_DICT:
            return self.SCALER_DICT[name](**kwargs)
        self.logger.error('Unknown scaler name {}'.format(name))
        return None

    @staticmethod
    def feature_scaling(scaler, x_data, fit=True):
        """Fit ``scaler`` (``fit=True``) or transform ``x_data`` with it.

        Returns the result of ``scaler.fit`` when fitting, otherwise the
        transformed data.
        """
        if fit:
            return scaler.fit(x_data)
        return scaler.transform(x_data)

    def set_estimator(self, name, **kwargs):
        """Instantiate the estimator registered as ``name``.

        Logs an error and returns ``None`` for an unknown name.
        """
        if name in self.ESTIMATOR_DICT:
            return self.ESTIMATOR_DICT[name](**kwargs)
        self.logger.error('Unknown estimator name {}'.format(name))
        return None

    @staticmethod
    def _estimator_fit(estimator, x_data, y_data, **kwargs):
        """Fit ``estimator`` and return whatever ``fit`` returns."""
        return estimator.fit(x_data, y_data, **kwargs)

    def pure_estimation(self,
                        estimator,
                        x_data,
                        y_data,
                        fit=True,
                        model_file_name=None,
                        verb=False,
                        **kwargs):
        """Fit the estimator (``fit=True``) or predict with it.

        When fitting, ``kwargs`` are forwarded to ``estimator.fit`` via
        ``try_save_notify_exit`` and the fitted estimator is returned.
        Otherwise the predictions for ``x_data`` are returned.
        """
        if fit:
            return self.try_save_notify_exit(self._estimator_fit,
                                             estimator,
                                             x_data,
                                             y_data,
                                             model_file_name=model_file_name,
                                             verb=verb,
                                             **kwargs)
        # NOTE: a session re-initialisation was disabled here previously:
        # ktf.get_session().run(tf.global_variables_initializer())
        return estimator.predict(x_data)

    def model_residual(self,
                       estimator,
                       x_data,
                       y_data,
                       fit=False,
                       model_file_name=None,
                       verb=False,
                       **kwargs):
        """Return ``y_data - prediction``, fitting first when ``fit``."""
        if fit:
            estimator = self.pure_estimation(estimator, x_data, y_data, True,
                                             model_file_name, verb, **kwargs)
        estimated_data = self.pure_estimation(estimator, x_data, y_data,
                                              False, model_file_name, verb)
        return y_data - estimated_data

    @staticmethod
    def metric_to_scorer(metric, **kwargs):
        """Wrap a metric function with ``make_scorer``."""
        return make_scorer(metric, **kwargs)

    def set_scorer(self, name, make_score=True, **kwargs):
        """Return the scorer registered as ``name``.

        With ``make_score`` the registered metric function is wrapped by
        ``make_scorer`` using the registered ``greater_better`` flag;
        otherwise the raw metric function is returned. Logs an error and
        returns ``None`` for an unknown name.
        """
        if name not in self.SCORER_DICT:
            self.logger.error('Unknown scorer name {}'.format(name))
            return None
        scorer = self.SCORER_DICT[name]
        if make_score:
            return self.metric_to_scorer(
                scorer['func'],
                greater_is_better=scorer['greater_better'],
                **kwargs)
        return scorer['func']

    def model_scoring(self,
                      estimator,
                      x_data,
                      y_data,
                      metric,
                      fit=False,
                      model_file_name=None,
                      verb=False,
                      **kwargs):
        """Return ``metric(y_data, prediction)``, fitting first when ``fit``."""
        if fit:
            estimator = self.pure_estimation(estimator, x_data, y_data, True,
                                             model_file_name, verb, **kwargs)
        estimated_data = self.pure_estimation(estimator, x_data, y_data,
                                              False, model_file_name, verb)
        return metric(y_data, estimated_data)

    @staticmethod
    def set_estimation_pipeline(scaler, estimator):
        """Chain ``scaler`` and ``estimator`` into a two-step ``Pipeline``."""
        return Pipeline([('scaler', scaler), ('estimator', estimator)])

    def train_valid_evaluation(self,
                               estimator,
                               x_train,
                               y_train,
                               x_valid,
                               y_valid,
                               scorer,
                               fit=False,
                               model_file_name=None,
                               verb=False,
                               **kwargs):
        """Score the estimator on the training and the validation set.

        Returns ``{'train': ..., 'valid': ...}``. The estimator is fitted
        on the training data first when ``fit`` is true.
        """
        if fit:
            estimator = self.pure_estimation(estimator, x_train, y_train,
                                             True, model_file_name, verb,
                                             **kwargs)
        return {
            'train': self.model_scoring(estimator, x_train, y_train, scorer,
                                        False),
            'valid': self.model_scoring(estimator, x_valid, y_valid, scorer,
                                        False),
        }

    def set_cv(self, cv_name, **kwargs):
        """Instantiate the cross-validator registered as ``cv_name``.

        Logs an error and returns ``None`` for an unknown name.
        """
        if cv_name in self.CV_DICT:
            return self.CV_DICT[cv_name](**kwargs)
        self.logger.error('Unknown cv name {}'.format(cv_name))
        return None

    @staticmethod
    def _cross_validation(estimator, x_data, y_data, scorer, **kwargs):
        """Run ``cross_validate`` and return its result."""
        return cross_validate(estimator,
                              x_data,
                              y_data,
                              scoring=scorer,
                              **kwargs)

    def cross_validation(self,
                         estimator,
                         x_data,
                         y_data,
                         scorer,
                         model_file_name=None,
                         verb=False,
                         **kwargs):
        """Cross-validate the estimator and return the result.

        The result is saved when ``model_file_name`` is given; failures
        are handled as described in ``try_save_notify_exit``.
        """
        return self.try_save_notify_exit(self._cross_validation,
                                         estimator,
                                         x_data,
                                         y_data,
                                         scorer,
                                         model_file_name=model_file_name,
                                         verb=verb,
                                         **kwargs)

    def validation_curve(self, estimator, x_data, y_data, para_range_dict,
                         scorer, plot_file_name, **kwargs):
        """Compute and plot a validation curve for one parameter.

        The first key of ``para_range_dict`` is the parameter name and its
        value the range to evaluate. Returns a DataFrame with the parameter
        values and the fold-averaged train/validation scores.
        """
        para_name = list(para_range_dict.keys())[0]
        para_values = para_range_dict[para_name]
        # Keyword form works on every scikit-learn version; newer releases
        # reject these two arguments when passed positionally.
        train_score, valid_score = validation_curve(estimator,
                                                    x_data,
                                                    y_data,
                                                    param_name=para_name,
                                                    param_range=para_values,
                                                    scoring=scorer,
                                                    **kwargs)
        # Scores come back per fold; average across folds.
        train_score = np.mean(train_score, axis=1)
        valid_score = np.mean(valid_score, axis=1)
        data = pd.DataFrame({
            **para_range_dict,
            **{
                'train_score': train_score,
                'valid_score': valid_score
            }
        })
        self.visual.time_series_plot(
            plot_file_name,
            data,
            para_name, ['train_score', 'valid_score'],
            title_dict={
                'title': 'Validation curve for parameter {}'.format(para_name),
                'x_title': para_name,
                'y_title': 'score'
            })
        return data

    def learning_curve(self, estimator, x_data, y_data, scorer,
                       plot_file_name, **kwargs):
        """Compute and plot a learning curve.

        Returns a DataFrame with the training sizes and the fold-averaged
        train/validation scores.
        """
        train_sizes, train_score, valid_score = learning_curve(estimator,
                                                               x_data,
                                                               y_data,
                                                               scoring=scorer,
                                                               **kwargs)
        # Scores are 2-D (one column per fold); a DataFrame column must be
        # 1-D, so average across folds as validation_curve does.
        train_score = np.mean(train_score, axis=1)
        valid_score = np.mean(valid_score, axis=1)
        data = pd.DataFrame({
            'train_size': train_sizes,
            'train_score': train_score,
            'valid_score': valid_score
        })
        self.visual.time_series_plot(plot_file_name,
                                     data,
                                     'train_size',
                                     ['train_score', 'valid_score'],
                                     title_dict={
                                         'title': 'Learning curve',
                                         'x_title': 'train_size',
                                         'y_title': 'score'
                                     })
        return data

    def para_search(self,
                    estimator,
                    x_data,
                    y_data,
                    method,
                    search_para,
                    scorer,
                    model_file_name=None,
                    verb=False,
                    **kwargs):
        """Wrap the estimator in the searcher named ``method`` and fit it.

        ``kwargs`` are passed to the searcher constructor, not to ``fit``.
        """
        searcher = self.SEARCHER_DICT[method]
        estimator = searcher(estimator,
                             search_para,
                             scoring=scorer,
                             **kwargs)
        return self.pure_estimation(estimator, x_data, y_data, True,
                                    model_file_name, verb)

    @staticmethod
    def _fill_pipeline_kwargs(keys, resolve, scaler_kwargs, estimator_kwargs):
        """Route resolved search values into scaler / estimator kwargs.

        Keys containing ``scaler`` / ``estimator`` are stored with the
        ``scaler__`` / ``estimator__`` prefix removed. When both
        ``optimizer`` and ``lr`` end up in the estimator kwargs, the
        optimizer is instantiated with that learning rate and ``lr`` is
        removed. Both dicts are modified in place.
        """
        for key in keys:
            if 'scaler' in key:
                scaler_kwargs[key.replace('scaler__', '')] = resolve(key)
            if 'estimator' in key:
                estimator_kwargs[key.replace('estimator__', '')] = resolve(key)
        if 'optimizer' in estimator_kwargs and 'lr' in estimator_kwargs:
            estimator_kwargs['optimizer'] = estimator_kwargs['optimizer'](
                lr=estimator_kwargs['lr'])
            estimator_kwargs.pop('lr')

    def hyperopt_search(self,
                        scaler_dict,
                        estimator_dict,
                        x_data,
                        y_data,
                        method,
                        search_para,
                        scorer,
                        model_file_name=None,
                        verb=False,
                        **kwargs):
        """Tune a scaler + estimator pipeline with hyperopt and refit it.

        ``scaler_dict`` / ``estimator_dict`` provide ``'name'`` and base
        ``'kwargs'``. ``search_para`` maps ``scaler__*`` / ``estimator__*``
        keys to dicts with ``'dtype'`` and ``'dist'`` (plus ``'choice'``
        when ``dtype`` is ``None``). ``kwargs`` are forwarded to
        ``cross_val_score``; an optional ``'fit_params'`` entry is also
        used for the final fit.

        The trial log and the best parameters are saved through
        ``save_estimator``. As before, the ``'kwargs'`` dicts of
        ``scaler_dict`` / ``estimator_dict`` are updated in place with the
        best parameters.

        Returns ``(fitted_pipeline, trial_log)``.
        """
        network_tags = ('nn', 'lstm', 'cnn')

        def hyperopt_min_func(space):
            # Work on copies so one trial's values (in particular an
            # already instantiated optimizer) never leak into the next.
            scaler_kwargs = dict(scaler_dict['kwargs'])
            estimator_kwargs = dict(estimator_dict['kwargs'])

            def sampled_value(key):
                spec = space[key]
                if spec['dtype'] is None:
                    return spec['dist']
                return spec['dtype'](spec['dist'])

            self._fill_pipeline_kwargs(space.keys(), sampled_value,
                                       scaler_kwargs, estimator_kwargs)
            scaler = self.set_feature_scaler(scaler_dict['name'],
                                             **scaler_kwargs)
            estimator = self.set_estimator(estimator_dict['name'],
                                           **estimator_kwargs)
            model = self.set_estimation_pipeline(scaler, estimator)
            # hyperopt minimises, so negate the mean score.
            score = -cross_val_score(
                model, x_data, y_data, scoring=scorer, **kwargs).mean()
            if any(tag in estimator_dict['name'] for tag in network_tags):
                K.clear_session()
            return score

        searcher = self.SEARCHER_DICT[method]
        trials = Trials()
        best = fmin(fn=hyperopt_min_func,
                    space=search_para,
                    algo=searcher,
                    max_evals=self.HYPEROPT_MAX_ITER,
                    trials=trials)
        log = trials.trials
        self.save_estimator(log, 'tmp_hyperpt_search_log.pkl')
        best_para = {key: best[key] for key in search_para}
        self.save_estimator(best_para, 'temp_hyperopt_best_para.pkl')

        def best_value(key):
            spec = search_para[key]
            if spec['dtype'] is None:
                # Here the best value is used as an index into the choices.
                return spec['choice'][best_para[key]]
            return spec['dtype'](best_para[key])

        scaler_kwargs = scaler_dict['kwargs']
        estimator_kwargs = estimator_dict['kwargs']
        self._fill_pipeline_kwargs(best_para.keys(), best_value,
                                   scaler_kwargs, estimator_kwargs)
        scaler = self.set_feature_scaler(scaler_dict['name'], **scaler_kwargs)
        estimator = self.set_estimator(estimator_dict['name'],
                                       **estimator_kwargs)
        estimator = self.set_estimation_pipeline(scaler, estimator)
        # Tolerate callers that pass no (or None) fit_params.
        fit_params = kwargs.get('fit_params') or {}
        estimator = self.pure_estimation(estimator, x_data, y_data, True,
                                         model_file_name, verb, **fit_params)
        return estimator, log
class Hse730(Hse730Setting):
    """Scrape house730.com search results and notify about unseen listings.

    The workflow (see ``hse730_scrapping``) loads the configured number of
    search-result pages, keeps the property links whose id is not in the
    stored list of already checked ids, stores the updated id list,
    downloads the property pages, filters them by size / age / price text
    and sends every remaining record through the notifier.

    Configuration values (tags, URLs, thresholds, file names) are read from
    attributes expected on the settings base class.
    """

    def __init__(self):
        self.scrapper = WebScrapper()
        self.io = FileIO()
        self.notifier = TelegramNotifier()

    def _search_result_checker(self, html):
        """Return whether ``html`` contains the search-result element."""
        return check_html_element_exist(html, self.RESULT_TAG)

    def _property_page_checker(self, html):
        """Return whether ``html`` contains the property-detail element."""
        return check_html_element_exist(html, self.PROPERTY_TAB)

    def get_check_id(self):
        """Load and return the stored collection of already checked ids."""
        return self.io.load_file(DATA_PATH, self.CHECK_ID_FILE, 'json')

    def get_search_url_list(self):
        """Return the search URLs for pages 1..``CHECK_NO_PAGE``."""
        districts = ''.join(self.DISTRICT)
        return [
            self.RESULT_URL.format(districts, page_no + 1, self.LOWEST_PRICE,
                                   self.HIGHEST_PRICE)
            for page_no in range(self.CHECK_NO_PAGE)
        ]

    def get_html_result_pages(self, url_list):
        """Load every search URL and return the scrapper's responses."""
        return self.scrapper.load_multiple_html(
            url_list, html_checker=self._search_result_checker, asyn=True)

    @staticmethod
    def _get_property_id_from_url(url):
        """Extract the numeric property id from a relative or absolute URL.

        Returns ``-1`` when ``url`` is not a string (e.g. an ``<a>`` tag
        without ``href``), is not a property URL, or carries a non-numeric
        id.
        """
        if not isinstance(url, str) or '/buy-property-' not in url:
            return -1
        try:
            return int(
                url.replace('https://www.house730.com',
                            '').replace('/buy-property-',
                                        '').replace('.html', ''))
        except ValueError:
            return -1

    def save_check_id(self, new_check_id_list):
        """Persist the list of checked ids."""
        self.io.save_file(new_check_id_list, DATA_PATH, self.CHECK_ID_FILE,
                          'json')

    def get_property_url_from_html(self, response, check_id):
        """Return absolute URLs of the unseen properties on a result page.

        Links that are not property URLs (id ``-1``) are skipped, so they
        are neither scraped nor stored as checked ids. A failed response is
        reported through the notifier and yields an empty list.
        """
        if not response['ok']:
            self.notifier.send_message(
                'Unable to load result for page {}. {}'.format(
                    response['url'], response['error']))
            return []

        soup = BeautifulSoup(response['message'], 'html.parser')
        result_list = soup.select(self.RESULT_TAG)[0]
        hrefs = [tag.get('href') for tag in result_list.select('a.name')]

        property_url_list = []
        for href in hrefs:
            property_id = self._get_property_id_from_url(href)
            if property_id == -1 or property_id in check_id:
                continue
            property_url_list.append('https://www.house730.com' + href)
        return property_url_list

    def aggregate_property_url(self, response_list, check_id):
        """Collect the unseen property URLs of all result pages in one list."""
        return [
            url
            for response in response_list
            for url in self.get_property_url_from_html(response, check_id)
        ]

    def append_new_check_id_list(self, property_url_list, check_id,
                                 max_size=50):
        """Prepend the ids of the new URLs to ``check_id`` and save the list.

        Only the first ``max_size`` ids (newest first) are kept; the
        default of 50 is the previously hard-coded limit.
        """
        new_check_id_list = [
            self._get_property_id_from_url(property_url)
            for property_url in property_url_list
        ]
        new_check_id_list += check_id
        self.save_check_id(new_check_id_list[:max_size])

    def get_property_pages(self, property_url_list):
        """Load every property page and return the scrapper's responses."""
        return self.scrapper.load_multiple_html(
            property_url_list, html_checker=self._property_page_checker)

    def extract_property_pages(self, response):
        """Turn one property-page response into a one-record DataFrame.

        The detail table is read with ``pd.read_html``, its first column
        becomes the header and the page URL is added as column ``URL``.
        A failed response is reported through the notifier and yields an
        empty DataFrame.
        """
        if not response['ok']:
            self.notifier.send_message(
                'Unable to load property for page {}. {}'.format(
                    response['url'], response['error']))
            return pd.DataFrame([])

        soup = BeautifulSoup(response['message'], 'html.parser')
        property_table = pd.read_html(
            str(soup.select(self.PROPERTY_TAB)[0]))[0]
        property_table = property_table.set_index(0).T
        return property_table.assign(**{'URL': response['url']})

    def aggregate_property_table(self, response_list):
        """Concatenate all property records, indexed by the House730 id.

        Empty frames (failed pages) are skipped; when nothing usable is
        left an empty DataFrame is returned instead of failing on the
        missing index column.
        """
        tables = [
            self.extract_property_pages(response)
            for response in response_list
        ]
        tables = [table for table in tables if len(table) > 0]
        if not tables:
            return pd.DataFrame([])
        property_data = pd.concat(tables, sort=False)
        return property_data.set_index('House730樓盤編號')

    def clean_filter_property_table(self, property_data):
        """Tidy the text columns and filter rows by size, age and price text.

        Rows with a missing size or age are kept. Rows whose price text
        contains '居屋' are removed; a missing price does not match.
        """
        property_data = property_data.assign(
            **{
                '售價':
                property_data.loc[:, '售價'].str.split('按揭計算機').str[0],
                '樓盤地址':
                property_data.loc[:, '樓盤地址'].str.replace('屋苑位置', '')
            })

        # regex=True is explicit: since pandas 2.0 the default is a literal
        # match, which would leave the unit text in place and break the
        # float conversion.
        size = property_data.loc[:, '實用面積(呎)']
        size_value = size.str.replace('[^0-9]', '', regex=True).astype(float)
        property_data = property_data.loc[
            (size_value >= self.SMALLEST_SIZE) | size.isnull()]

        age = property_data.loc[:, '樓齡(年)']
        age_value = age.str.replace('[^0-9]', '', regex=True).astype(float)
        property_data = property_data.loc[
            (age_value <= self.OLDEST_PROPERTY) | age.isnull()]

        # na=False keeps the mask boolean when a price is missing; without
        # it the NaN entries make the ``~`` negation fail.
        excluded = property_data.loc[:, '售價'].str.contains('居屋', na=False)
        property_data = property_data.loc[~excluded]

        property_data = property_data.drop(
            ['物業編號', '樓盤狀態', '瀏覽人次', '刊登或續期日', '記錄更新', '放盤到期日'],
            axis=1)
        return property_data.fillna('')

    @staticmethod
    def convert_property_dict_to_str(property_dict):
        """Format a record as one ``key: value`` line per field."""
        return ''.join(
            '{}: {}\n'.format(key, value)
            for key, value in property_dict.items())

    def send_property_details(self, property_data):
        """Send every record of ``property_data`` to every chat in CHAT_ID."""
        messages = [
            self.convert_property_dict_to_str(record)
            for record in property_data.to_dict('records')
        ]
        for chat_id in CHAT_ID:
            # The notifier reads its target chat from this attribute.
            self.notifier.CHAT_ID = chat_id
            for message in messages:
                self.notifier.send_message(message)

    def hse730_scrapping(self):
        """Run the full scrape, store-ids, filter and notify cycle."""
        check_id = self.get_check_id()
        url_list = self.get_search_url_list()
        response_list = self.get_html_result_pages(url_list)
        property_url_list = self.aggregate_property_url(
            response_list, check_id)
        # Ids are recorded before the pages are fetched, so a listing is
        # only ever attempted once.
        self.append_new_check_id_list(property_url_list, check_id)
        response_list = self.get_property_pages(property_url_list)
        property_data = self.aggregate_property_table(response_list)
        if len(property_data) > 0:
            property_data = self.clean_filter_property_table(property_data)
        if len(property_data) > 0:
            self.send_property_details(property_data)