示例#1
0
class Hse28Scrapper(Hse28Setting):
    def __init__(self):
        """Create the helper objects used by the scraper.

        Instantiates a ``WebScrapper``, a ``FileIO`` and a
        ``TelegramNotifier`` and stores them on the instance. What each of
        them does on construction is not visible from this class.
        """
        self.scrapper = WebScrapper()
        self.io = FileIO()
        self.notifier = TelegramNotifier()

    def _search_result_checker(self, html):
        """Check ``html`` for the search-result element.

        Delegates to ``check_html_element_exist`` using ``self.RESULT_TAG``
        and returns whatever that helper returns.
        """
        result_selector = self.RESULT_TAG
        return check_html_element_exist(html, result_selector)

    def _property_page_checker(self, html):
        """Check ``html`` for the property-detail element.

        Delegates to ``check_html_element_exist`` using ``self.PROPERTY_TAB``
        and returns whatever that helper returns.
        """
        table_selector = self.PROPERTY_TAB
        return check_html_element_exist(html, table_selector)

    def get_last_max_id(self):
        """Return the stored ``max_id`` entry as an ``int``.

        The entry is read from the JSON file ``self.TEMP_ID_FILE`` located
        under ``DATA_PATH`` via ``self.io.load_file``.
        """
        stored = self.io.load_file(DATA_PATH, self.TEMP_ID_FILE, 'json')
        return int(stored['max_id'])

    async def sort_filter_page(self, session):
        """Apply the price filter and the sort option to a browser session.

        Runs four awaited steps in order, each receiving the session
        returned by the previous one: fill the low-price box, fill the
        high-price box, click the price button, click the sort option.

        Returns:
            The session returned by the last step.
        """
        steps = (
            (async_fill_box, self.LOW_PRICE_TAG, self.LOWEST_PRICE),
            (async_fill_box, self.HIGH_PRICE_TAG, self.HIGHEST_PRICE),
            (async_click_button, self.PRICE_BUTTON_TAG, self.LOAD_TAG),
            (async_click_option, self.SORT_TAG, self.LOAD_TAG),
        )
        for action, target, argument in steps:
            session = await action(session, target, argument)
        return session

    def get_search_url_list(self):
        url_list = [
            self.RESULT_URL.format(district) for district in self.DISTRICT
        ]
        return url_list

    def get_html_result_pages(self, url_list):
        """Browse every search URL and return the scrapper's responses.

        Calls ``self.scrapper.browse_multiple_html`` with
        ``sort_filter_page`` as the extra action, the search-result checker
        as HTML validator and ``asyn=True``.
        """
        browse_options = {
            'extra_action': self.sort_filter_page,
            'html_checker': self._search_result_checker,
            'asyn': True,
        }
        return self.scrapper.browse_multiple_html(url_list, **browse_options)

    @staticmethod
    def _get_property_id_from_url(url):
        if 'https://www.28hse.com/buy-property-' not in url:
            property_id = -1
        else:
            property_id = int(
                url.replace('https://www.28hse.com/buy-property-',
                            '').replace('.html', ''))
        return property_id

    def get_property_url_from_html(self, response, max_id):
        """Return the property links of one result page that are newer than ``max_id``.

        Args:
            response: Mapping with the keys ``ok``, ``message`` (HTML text),
                ``url`` and, on failure, ``error``.
            max_id: Only links whose parsed property id is strictly greater
                than this value are kept.

        Returns:
            List of ``href`` strings. Empty when the page failed to load or
            the result container is missing; a notification is sent in both
            of those cases.
        """
        if not response['ok']:
            self.notifier.send_message(
                'Unable to load result for page {}. {}'.format(
                    response['url'], response['error']))
            return []

        soup = BeautifulSoup(response['message'], 'html.parser')
        result_blocks = soup.select(self.RESULT_TAG)
        if not result_blocks:
            # Previously an IndexError; report it like any other load failure.
            self.notifier.send_message(
                'Unable to load result for page {}. {}'.format(
                    response['url'], 'result list not found'))
            return []

        hrefs = (tag.get('href') for tag in result_blocks[0].select('a'))
        # Anchors without an href yield None, which the id parser cannot
        # handle, so they are skipped before parsing.
        return [
            href for href in hrefs
            if href and self._get_property_id_from_url(href) > max_id
        ]

    def aggregate_property_url(self, response_list, max_id):
        property_url_list = list(
            map(
                lambda response: self.get_property_url_from_html(
                    response, max_id), response_list))
        property_url_list = [
            url for property_url in property_url_list for url in property_url
        ]
        return property_url_list

    def get_property_pages(self, property_url_list):
        """Load every property page and return the scrapper's responses.

        Uses ``self.scrapper.load_multiple_html`` with the property-page
        checker as HTML validator.
        """
        page_checker = self._property_page_checker
        return self.scrapper.load_multiple_html(property_url_list,
                                                html_checker=page_checker)

    def extract_property_pages(self, response):
        """Parse one property page into a single-row ``DataFrame``.

        The first element matching ``self.PROPERTY_TAB`` is read as an HTML
        table, its first column becomes the header after transposing, and a
        ``URL`` column holding ``response['url']`` is appended.

        Args:
            response: Mapping with the keys ``ok``, ``message`` (HTML text),
                ``url`` and, on failure, ``error``.

        Returns:
            The parsed table, or an empty ``DataFrame`` when the page failed
            to load or holds no matching element (a notification is sent in
            both cases).
        """
        from io import StringIO

        if not response['ok']:
            self.notifier.send_message(
                'Unable to load property for page {}. {}'.format(
                    response['url'], response['error']))
            return pd.DataFrame([])

        soup = BeautifulSoup(response['message'], 'html.parser')
        matches = soup.select(self.PROPERTY_TAB)
        if not matches:
            # Previously an IndexError; report it like any other load failure.
            self.notifier.send_message(
                'Unable to load property for page {}. {}'.format(
                    response['url'], 'property table not found'))
            return pd.DataFrame([])

        # read_html expects a path or file-like object; passing literal HTML
        # text is deprecated in recent pandas, so wrap it in StringIO.
        property_table = pd.read_html(StringIO(str(matches[0])))[0]
        property_table = property_table.set_index(0).T
        return property_table.assign(URL=response['url'])

    def aggregate_property_table(self, response_list):
        property_data_list = list(
            map(self.extract_property_pages, response_list))
        if len(property_data_list) > 0:
            property_data = pd.concat(property_data_list, sort=False)
            property_data = property_data.set_index('28HSE 樓盤編號')
        else:
            property_data = pd.DataFrame([])
        return property_data

    def clean_filter_property_table(self, property_data):
        property_data = property_data.assign(
            **{
                '售價': property_data.loc[:, '售價'].str.split('每月供款').str[0],
                '物業地址': property_data.loc[:, '物業地址'].str.replace('屋苑位置', '')
            })
        property_data = property_data.loc[
            (property_data.loc[:, '實用面積(呎)'].str.replace('[^0-9]', '').
             astype(float) >= self.SMALLEST_SIZE) |
            (property_data.loc[:, '實用面積(呎)'].isnull())]
        property_data = property_data.loc[
            (property_data.loc[:, '樓齡(年)'].str.replace('[^0-9]', '').
             astype(float) <= self.OLDEST_PROPERTY) |
            (property_data.loc[:, '樓齡(年)'].isnull())]
        property_data = property_data.loc[~property_data.loc[:, '售價'].str.
                                          contains('居屋')]
        property_data = property_data.drop(
            ['物業編號', '樓盤狀態', '瀏覽人次', '收藏人次', '刊登或續期日', '記錄更新', '放盤到期日'],
            axis=1)
        property_data = property_data.fillna('')
        return property_data

    def save_max_id(self, property_data):
        """Persist the largest index value of ``property_data`` as ``max_id``.

        The index is cast to ``int`` and its maximum is stored, as a string,
        in the JSON file ``self.TEMP_ID_FILE`` under ``DATA_PATH``. Nothing
        is written when the maximum renders as ``'nan'``.
        """
        newest_id = str(property_data.index.astype(int).max())
        if newest_id == 'nan':
            return
        payload = {'max_id': newest_id}
        self.io.save_file(payload, DATA_PATH, self.TEMP_ID_FILE, 'json')

    @staticmethod
    def convert_property_dict_to_str(property_dict):
        message_str = ''
        for key, value in property_dict.items():
            message_str += '{}: {}\n'.format(key, value)
        return message_str

    def send_property_details(self, property_data):
        """Send every row of ``property_data`` as a message to every chat.

        Rows are converted to text with ``convert_property_dict_to_str``.
        For each id in ``CHAT_ID`` the notifier's ``CHAT_ID`` attribute is
        set before all messages are sent; it keeps the last id afterwards.
        """
        records = property_data.to_dict('records')
        messages = [
            self.convert_property_dict_to_str(record) for record in records
        ]
        for chat_id in CHAT_ID:
            self.notifier.CHAT_ID = chat_id
            send = self.notifier.send_message
            for message in messages:
                send(message)

    def hse28_scrapping(self):
        """Run one scraping pass.

        Loads the last stored id, browses the search pages, fetches the
        property pages whose id is newer, then cleans and filters the
        resulting table. When rows remain, the newest id is stored and the
        rows are sent out.
        """
        last_id = self.get_last_max_id()
        search_pages = self.get_html_result_pages(self.get_search_url_list())
        new_urls = self.aggregate_property_url(search_pages, last_id)
        property_pages = self.get_property_pages(new_urls)
        property_data = self.aggregate_property_table(property_pages)
        if len(property_data) == 0:
            return
        property_data = self.clean_filter_property_table(property_data)
        if len(property_data) == 0:
            return
        self.save_max_id(property_data)
        self.send_property_details(property_data)
示例#2
0
class Modeller(ModellerSetting):
    def __init__(self):
        """Create the I/O, plotting and logging helpers and set the backend session.

        Besides storing a ``FileIO``, a ``Visualization`` and a logger on
        the instance, this calls ``ktf.set_session(get_session())``, i.e.
        construction has a side effect outside the instance.
        """
        self.file_io = FileIO()
        self.visual = Visualization()
        self.logger = set_logger(LOGGER_PATH, self.LOGGER_FILE,
                                 self.LOGGER_LEVEL, __name__)
        # NOTE(review): ktf / get_session are defined outside this view;
        # presumably this installs a Keras/TensorFlow session - confirm.
        ktf.set_session(get_session())

    def save_estimator(self, estimator, file_name):
        """Store ``estimator`` as ``file_name`` under ``self.MODEL_PATH``.

        Written through ``self.file_io.save_file`` with the ``'joblib'``
        format flag.
        """
        target_dir = self.MODEL_PATH
        self.file_io.save_file(estimator, target_dir, file_name, 'joblib')

    def load_estimator(self, file_name):
        """Load and return the object stored as ``file_name`` under ``self.MODEL_PATH``.

        Read through ``self.file_io.load_file`` with the ``'joblib'`` format
        flag.
        """
        source_dir = self.MODEL_PATH
        return self.file_io.load_file(source_dir, file_name, 'joblib')

    @staticmethod
    def notify_model_fitting_failure(message):
        """Send a model-fitting failure notice through ``NOTIFIER_AGENT``.

        ``message`` is appended to the fixed text
        ``'Error in fitting model. '``.
        """
        notice = 'Error in fitting model. {}'.format(message)
        send_message(notice, NOTIFIER_AGENT)

    def try_save_notify_exit(self,
                             func,
                             estimator,
                             x_data,
                             y_data,
                             *args,
                             model_file_name=None,
                             verb=False,
                             **kwargs):
        try:
            estimator = func(estimator, x_data, y_data, *args, **kwargs)
            if model_file_name is not None:
                self.save_estimator(estimator, model_file_name)
            if verb:
                self.logger.debug(
                    'Model {} fitting completed'.format(model_file_name))
            return estimator
        except KeyboardInterrupt:
            sys.exit()
        except Exception as e:
            self.notify_model_fitting_failure('{}. model_file_name: {}'.format(
                e, model_file_name))
            sys.exit()

    def set_feature_scaler(self, name, **kwargs):
        if name in self.SCALER_DICT:
            scaler = self.SCALER_DICT[name](**kwargs)
        else:
            self.logger.error('Unknown scaler name {}'.format(name))
            scaler = None
        return scaler

    @staticmethod
    def feature_scaling(scaler, x_data, fit=True):
        if fit:
            model = scaler.fit(x_data)
            return model
        else:
            transformed_data = scaler.transform(x_data)
            return transformed_data

    def set_estimator(self, name, **kwargs):
        if name in self.ESTIMATOR_DICT:
            estimator = self.ESTIMATOR_DICT[name](**kwargs)
        else:
            self.logger.error('Unknown estimator name {}'.format(name))
            estimator = None
        return estimator

    @staticmethod
    def _estimator_fit(estimator, x_data, y_data, **kwargs):
        estimator = estimator.fit(x_data, y_data, **kwargs)
        return estimator

    def pure_estimation(self,
                        estimator,
                        x_data,
                        y_data,
                        fit=True,
                        model_file_name=None,
                        verb=False,
                        **kwargs):
        if fit:
            estimator = self.try_save_notify_exit(
                self._estimator_fit,
                estimator,
                x_data,
                y_data,
                model_file_name=model_file_name,
                verb=verb,
                **kwargs)
            return estimator
        else:
            # ktf.get_session().run(tf.global_variables_initializer())
            estimated_data = estimator.predict(x_data)
            return estimated_data

    def model_residual(self,
                       estimator,
                       x_data,
                       y_data,
                       fit=False,
                       model_file_name=None,
                       verb=False,
                       **kwargs):
        if fit:
            estimator = self.pure_estimation(estimator, x_data, y_data, True,
                                             model_file_name, verb, **kwargs)
        estimated_data = self.pure_estimation(estimator, x_data, y_data, False,
                                              model_file_name, verb)
        residual_data = y_data - estimated_data
        return residual_data

    @staticmethod
    def metric_to_scorer(metric, **kwargs):
        """Wrap ``metric`` with ``make_scorer``, forwarding ``**kwargs``."""
        scorer = make_scorer(metric, **kwargs)
        return scorer

    def set_scorer(self, name, make_score=True, **kwargs):

        if name in self.SCORER_DICT:
            scorer = self.SCORER_DICT[name]
            if make_score:
                scorer = self.metric_to_scorer(
                    scorer['func'],
                    greater_is_better=scorer['greater_better'],
                    **kwargs)
            else:
                scorer = scorer['func']
        else:
            self.logger.error('Unknown scorer name {}'.format(name))
            scorer = None
        return scorer

    def model_scoring(self,
                      estimator,
                      x_data,
                      y_data,
                      metric,
                      fit=False,
                      model_file_name=None,
                      verb=False,
                      **kwargs):
        if fit:
            estimator = self.pure_estimation(estimator, x_data, y_data, True,
                                             model_file_name, verb, **kwargs)
        estimated_data = self.pure_estimation(estimator, x_data, y_data, False,
                                              model_file_name, verb)
        score = metric(y_data, estimated_data)
        return score

    @staticmethod
    def set_estimation_pipeline(scaler, estimator):
        """Return a ``Pipeline`` with the steps ``'scaler'`` then ``'estimator'``."""
        steps = [('scaler', scaler), ('estimator', estimator)]
        return Pipeline(steps)

    def train_valid_evaluation(self,
                               estimator,
                               x_train,
                               y_train,
                               x_valid,
                               y_valid,
                               scorer,
                               fit=False,
                               model_file_name=None,
                               verb=False,
                               **kwargs):
        if fit:
            estimator = self.pure_estimation(estimator, x_train, y_train, True,
                                             model_file_name, verb, **kwargs)
        train_score = self.model_scoring(estimator, x_train, y_train, scorer,
                                         False)
        valid_score = self.model_scoring(estimator, x_valid, y_valid, scorer,
                                         False)
        score = {'train': train_score, 'valid': valid_score}
        return score

    def set_cv(self, cv_name, **kwargs):
        if cv_name in self.CV_DICT:
            cv = self.CV_DICT[cv_name](**kwargs)
        else:
            self.logger.error('Unknown scorer name {}'.format(cv_name))
            cv = None
        return cv

    @staticmethod
    def _cross_validation(estimator, x_data, y_data, scorer, **kwargs):
        """Run ``cross_validate`` with ``scorer`` as ``scoring`` and return its result.

        ``**kwargs`` are forwarded to ``cross_validate`` unchanged.
        """
        cv_result = cross_validate(estimator,
                                   x_data,
                                   y_data,
                                   scoring=scorer,
                                   **kwargs)
        return cv_result

    def cross_validation(self,
                         estimator,
                         x_data,
                         y_data,
                         scorer,
                         model_file_name=None,
                         verb=False,
                         **kwargs):
        estimator = self.try_save_notify_exit(self._cross_validation,
                                              estimator,
                                              x_data,
                                              y_data,
                                              scorer,
                                              model_file_name=model_file_name,
                                              verb=verb,
                                              **kwargs)
        return estimator

    def validation_curve(self, estimator, x_data, y_data, para_range_dict,
                         scorer, plot_file_name, **kwargs):
        """Compute and plot a validation curve for one hyper-parameter.

        Args:
            estimator: Estimator passed to scikit-learn's
                ``validation_curve``.
            x_data: Feature data.
            y_data: Target data.
            para_range_dict: Mapping whose first key is the parameter name
                and whose value is the list of candidate values.
            scorer: Passed as ``scoring``.
            plot_file_name: Passed to ``self.visual.time_series_plot``.
            **kwargs: Extra arguments for scikit-learn's
                ``validation_curve``.

        Returns:
            ``DataFrame`` holding the entries of ``para_range_dict`` plus
            the fold-averaged ``train_score`` and ``valid_score``.
        """
        para_name = list(para_range_dict.keys())[0]
        para_values = para_range_dict[para_name]
        # param_name / param_range are keyword-only in current scikit-learn;
        # the keyword form is accepted by older releases too.
        train_score, valid_score = validation_curve(estimator,
                                                    x_data,
                                                    y_data,
                                                    param_name=para_name,
                                                    param_range=para_values,
                                                    scoring=scorer,
                                                    **kwargs)
        # Scores come back per fold; average across folds (axis 1).
        train_score = np.mean(train_score, axis=1)
        valid_score = np.mean(valid_score, axis=1)
        data = pd.DataFrame({
            **para_range_dict,
            **{
                'train_score': train_score,
                'valid_score': valid_score
            }
        })
        self.visual.time_series_plot(
            plot_file_name,
            data,
            para_name, ['train_score', 'valid_score'],
            title_dict={
                'title': 'Validation curve for parameter {}'.format(para_name),
                'x_title': para_name,
                'y_title': 'score'
            })
        return data

    def learning_curve(self, estimator, x_data, y_data, scorer, plot_file_name,
                       **kwargs):
        """Compute and plot a learning curve.

        Args:
            estimator: Estimator passed to scikit-learn's ``learning_curve``.
            x_data: Feature data.
            y_data: Target data.
            scorer: Passed as ``scoring``.
            plot_file_name: Passed to ``self.visual.time_series_plot``.
            **kwargs: Extra arguments for scikit-learn's ``learning_curve``.

        Returns:
            ``DataFrame`` with the columns ``train_size``, ``train_score``
            and ``valid_score`` (scores averaged across folds).
        """
        train_sizes, train_score, valid_score = learning_curve(estimator,
                                                               x_data,
                                                               y_data,
                                                               scoring=scorer,
                                                               **kwargs)
        # Scores are returned per fold as 2-D arrays; a DataFrame column
        # must be 1-D, so average across folds as validation_curve does.
        train_score = np.mean(train_score, axis=1)
        valid_score = np.mean(valid_score, axis=1)
        data = pd.DataFrame({
            'train_size': train_sizes,
            'train_score': train_score,
            'valid_score': valid_score
        })
        self.visual.time_series_plot(plot_file_name,
                                     data,
                                     'train_size',
                                     ['train_score', 'valid_score'],
                                     title_dict={
                                         'title': 'Learning curve',
                                         'x_title': 'train_size',
                                         'y_title': 'score'
                                     })
        return data

    def para_search(self,
                    estimator,
                    x_data,
                    y_data,
                    method,
                    search_para,
                    scorer,
                    model_file_name=None,
                    verb=False,
                    **kwargs):
        searcher = self.SEARCHER_DICT[method]
        estimator = searcher(estimator, search_para, scoring=scorer, **kwargs)
        estimator = self.pure_estimation(estimator, x_data, y_data, True,
                                         model_file_name, verb)
        return estimator

    def hyperopt_search(self,
                        scaler_dict,
                        estimator_dict,
                        x_data,
                        y_data,
                        method,
                        search_para,
                        scorer,
                        model_file_name=None,
                        verb=False,
                        **kwargs):
        """Tune a scaler + estimator pipeline with hyperopt, then refit it.

        Args:
            scaler_dict: Mapping with ``'name'`` (key for
                ``set_feature_scaler``) and ``'kwargs'`` (base constructor
                arguments). Not modified.
            estimator_dict: Same layout for ``set_estimator``. Not modified.
            x_data: Feature data.
            y_data: Target data.
            method: Key into ``self.SEARCHER_DICT``; the value is passed to
                ``fmin`` as ``algo``.
            search_para: Search space. Keys start with ``'scaler__'`` or
                ``'estimator__'``; each value is a mapping with ``'dist'``,
                ``'dtype'`` (callable or ``None``) and, when ``'dtype'`` is
                ``None``, ``'choice'`` (sequence indexed by the value
                ``fmin`` reports as best).
            scorer: Passed to ``cross_val_score`` as ``scoring``.
            model_file_name: Forwarded to ``pure_estimation`` for the refit.
            verb: Forwarded to ``pure_estimation`` for the refit.
            **kwargs: Forwarded to ``cross_val_score``. An optional
                ``'fit_params'`` entry is also unpacked into the final fit.

        Returns:
            Tuple ``(fitted_pipeline, trials_log)``.
        """
        prefixes = ('scaler__', 'estimator__')

        def build_pipeline(param_values):
            """Build a fresh pipeline from the base kwargs plus overrides."""
            # Copy the base kwargs. Updating the caller's dicts in place
            # leaked state between trials: the optimizer class was replaced
            # by an instance and 'lr' was popped.
            scaler_kwargs = dict(scaler_dict['kwargs'])
            estimator_kwargs = dict(estimator_dict['kwargs'])
            for param_key, param_value in param_values.items():
                if param_key.startswith('scaler__'):
                    scaler_kwargs[param_key[len('scaler__'):]] = param_value
                elif param_key.startswith('estimator__'):
                    estimator_kwargs[
                        param_key[len('estimator__'):]] = param_value
            if 'optimizer' in estimator_kwargs and 'lr' in estimator_kwargs:
                # 'optimizer' holds a factory; it is instantiated with the
                # learning rate, and 'lr' is not passed on separately.
                learning_rate = estimator_kwargs.pop('lr')
                estimator_kwargs['optimizer'] = estimator_kwargs['optimizer'](
                    lr=learning_rate)
            scaler = self.set_feature_scaler(scaler_dict['name'],
                                             **scaler_kwargs)
            estimator = self.set_estimator(estimator_dict['name'],
                                           **estimator_kwargs)
            return self.set_estimation_pipeline(scaler, estimator)

        def hyperopt_min_func(space):
            """Objective for fmin: negated mean cross-validation score."""
            trial_values = {}
            for param_key, param_value in space.items():
                if not param_key.startswith(prefixes):
                    continue
                caster = param_value['dtype']
                sampled = param_value['dist']
                trial_values[param_key] = (sampled if caster is None else
                                           caster(sampled))
            model = build_pipeline(trial_values)
            score = -cross_val_score(
                model, x_data, y_data, scoring=scorer, **kwargs).mean()
            if any(tag in estimator_dict['name']
                   for tag in ('nn', 'lstm', 'cnn')):
                K.clear_session()
            return score

        searcher = self.SEARCHER_DICT[method]
        trials = Trials()
        best = fmin(fn=hyperopt_min_func,
                    space=search_para,
                    algo=searcher,
                    max_evals=self.HYPEROPT_MAX_ITER,
                    trials=trials)
        log = trials.trials
        self.save_estimator(log, 'tmp_hyperpt_search_log.pkl')
        best_para = {key: best[key] for key in search_para.keys()}
        self.save_estimator(best_para, 'temp_hyperopt_best_para.pkl')

        best_values = {}
        for param_key, param_value in best_para.items():
            if not param_key.startswith(prefixes):
                continue
            spec = search_para[param_key]
            if spec['dtype'] is None:
                # The best value is used as an index into the choice list.
                best_values[param_key] = spec['choice'][param_value]
            else:
                best_values[param_key] = spec['dtype'](param_value)

        estimator = build_pipeline(best_values)
        # 'fit_params' is optional; a missing or None entry means no extras.
        fit_params = kwargs.get('fit_params') or {}
        estimator = self.pure_estimation(estimator, x_data, y_data, True,
                                         model_file_name, verb, **fit_params)
        return estimator, log
示例#3
0
class Hse730(Hse730Setting):
    def __init__(self):
        """Create the helper objects used by the scraper.

        Instantiates a ``WebScrapper``, a ``FileIO`` and a
        ``TelegramNotifier`` and stores them on the instance. What each of
        them does on construction is not visible from this class.
        """
        self.scrapper = WebScrapper()
        self.io = FileIO()
        self.notifier = TelegramNotifier()

    def _search_result_checker(self, html):
        """Check ``html`` for the search-result element.

        Delegates to ``check_html_element_exist`` using ``self.RESULT_TAG``
        and returns whatever that helper returns.
        """
        result_selector = self.RESULT_TAG
        return check_html_element_exist(html, result_selector)

    def _property_page_checker(self, html):
        """Check ``html`` for the property-detail element.

        Delegates to ``check_html_element_exist`` using ``self.PROPERTY_TAB``
        and returns whatever that helper returns.
        """
        table_selector = self.PROPERTY_TAB
        return check_html_element_exist(html, table_selector)

    def get_check_id(self):
        """Return the content of the JSON file ``self.CHECK_ID_FILE``.

        The file is read from ``DATA_PATH`` through ``self.io.load_file``.
        """
        return self.io.load_file(DATA_PATH, self.CHECK_ID_FILE, 'json')

    def get_search_url_list(self):
        url_list = [
            self.RESULT_URL.format(''.join(self.DISTRICT), page_no + 1,
                                   self.LOWEST_PRICE, self.HIGHEST_PRICE)
            for page_no in range(self.CHECK_NO_PAGE)
        ]
        return url_list

    def get_html_result_pages(self, url_list):
        """Load every search URL and return the scrapper's responses.

        Calls ``self.scrapper.load_multiple_html`` with the search-result
        checker as HTML validator and ``asyn=True``.
        """
        load_options = {
            'html_checker': self._search_result_checker,
            'asyn': True,
        }
        return self.scrapper.load_multiple_html(url_list, **load_options)

    @staticmethod
    def _get_property_id_from_url(url):
        if '/buy-property-' not in url:
            property_id = -1
        else:
            property_id = int(
                url.replace('https://www.house730.com',
                            '').replace('/buy-property-',
                                        '').replace('.html', ''))
        return property_id

    def save_check_id(self, new_check_id_list):
        """Write ``new_check_id_list`` to the JSON file ``self.CHECK_ID_FILE``.

        The file is written under ``DATA_PATH`` through
        ``self.io.save_file``.
        """
        target_file = self.CHECK_ID_FILE
        self.io.save_file(new_check_id_list, DATA_PATH, target_file, 'json')

    def get_property_url_from_html(self, response, check_id):
        """Return absolute URLs of the not-yet-checked properties on one result page.

        Args:
            response: Mapping with the keys ``ok``, ``message`` (HTML text),
                ``url`` and, on failure, ``error``.
            check_id: Collection of property ids that were already handled.

        Returns:
            List of absolute property URLs. Empty when the page failed to
            load or the result container is missing; a notification is sent
            in both of those cases.
        """
        if not response['ok']:
            self.notifier.send_message(
                'Unable to load result for page {}. {}'.format(
                    response['url'], response['error']))
            return []

        soup = BeautifulSoup(response['message'], 'html.parser')
        result_blocks = soup.select(self.RESULT_TAG)
        if not result_blocks:
            # Previously an IndexError; report it like any other load failure.
            self.notifier.send_message(
                'Unable to load result for page {}. {}'.format(
                    response['url'], 'result list not found'))
            return []

        host = 'https://www.house730.com'
        known_ids = set(check_id)
        property_url_list = []
        for property_tag in result_blocks[0].select('a.name'):
            href = property_tag.get('href')
            if not href:
                # Anchor without a link target; the id parser cannot
                # handle None.
                continue
            property_id = self._get_property_id_from_url(href)
            # -1 marks a link that is not a property page; skip it so it is
            # neither fetched nor stored in the check-id history.
            if property_id == -1 or property_id in known_ids:
                continue
            # Prefix the host only for site-relative links; an absolute
            # href would otherwise end up with the host doubled.
            if href.startswith(('http://', 'https://')):
                property_url_list.append(href)
            else:
                property_url_list.append(host + href)
        return property_url_list

    def aggregate_property_url(self, response_list, check_id):
        property_url_list = list(
            map(
                lambda response: self.get_property_url_from_html(
                    response, check_id), response_list))
        property_url_list = [
            url for property_url in property_url_list for url in property_url
        ]
        return property_url_list

    def append_new_check_id_list(self, property_url_list, check_id):
        new_check_id_list = [
            self._get_property_id_from_url(property_url)
            for property_url in property_url_list
        ]
        new_check_id_list += check_id
        new_check_id_list = new_check_id_list[:50]
        self.save_check_id(new_check_id_list)

    def get_property_pages(self, property_url_list):
        """Load every property page and return the scrapper's responses.

        Uses ``self.scrapper.load_multiple_html`` with the property-page
        checker as HTML validator.
        """
        page_checker = self._property_page_checker
        return self.scrapper.load_multiple_html(property_url_list,
                                                html_checker=page_checker)

    def extract_property_pages(self, response):
        """Parse one property page into a single-row ``DataFrame``.

        The first element matching ``self.PROPERTY_TAB`` is read as an HTML
        table, its first column becomes the header after transposing, and a
        ``URL`` column holding ``response['url']`` is appended.

        Args:
            response: Mapping with the keys ``ok``, ``message`` (HTML text),
                ``url`` and, on failure, ``error``.

        Returns:
            The parsed table, or an empty ``DataFrame`` when the page failed
            to load or holds no matching element (a notification is sent in
            both cases).
        """
        from io import StringIO

        if not response['ok']:
            self.notifier.send_message(
                'Unable to load property for page {}. {}'.format(
                    response['url'], response['error']))
            return pd.DataFrame([])

        soup = BeautifulSoup(response['message'], 'html.parser')
        matches = soup.select(self.PROPERTY_TAB)
        if not matches:
            # Previously an IndexError; report it like any other load failure.
            self.notifier.send_message(
                'Unable to load property for page {}. {}'.format(
                    response['url'], 'property table not found'))
            return pd.DataFrame([])

        # read_html expects a path or file-like object; passing literal HTML
        # text is deprecated in recent pandas, so wrap it in StringIO.
        property_table = pd.read_html(StringIO(str(matches[0])))[0]
        property_table = property_table.set_index(0).T
        return property_table.assign(URL=response['url'])

    def aggregate_property_table(self, response_list):
        property_data_list = list(
            map(self.extract_property_pages, response_list))
        if len(property_data_list) > 0:
            property_data = pd.concat(property_data_list, sort=False)
            property_data = property_data.set_index('House730樓盤編號')
        else:
            property_data = pd.DataFrame([])
        return property_data

    def clean_filter_property_table(self, property_data):
        property_data = property_data.assign(
            **{
                '售價': property_data.loc[:, '售價'].str.split('按揭計算機').str[0],
                '樓盤地址': property_data.loc[:, '樓盤地址'].str.replace('屋苑位置', '')
            })
        property_data = property_data.loc[
            (property_data.loc[:, '實用面積(呎)'].str.replace('[^0-9]', '').
             astype(float) >= self.SMALLEST_SIZE) |
            (property_data.loc[:, '實用面積(呎)'].isnull())]
        property_data = property_data.loc[
            (property_data.loc[:, '樓齡(年)'].str.replace('[^0-9]', '').
             astype(float) <= self.OLDEST_PROPERTY) |
            (property_data.loc[:, '樓齡(年)'].isnull())]
        property_data = property_data.loc[~property_data.loc[:, '售價'].str.
                                          contains('居屋')]
        property_data = property_data.drop(
            ['物業編號', '樓盤狀態', '瀏覽人次', '刊登或續期日', '記錄更新', '放盤到期日'], axis=1)
        property_data = property_data.fillna('')
        return property_data

    @staticmethod
    def convert_property_dict_to_str(property_dict):
        message_str = ''
        for key, value in property_dict.items():
            message_str += '{}: {}\n'.format(key, value)
        return message_str

    def send_property_details(self, property_data):
        """Send every row of ``property_data`` as a message to every chat.

        Rows are converted to text with ``convert_property_dict_to_str``.
        For each id in ``CHAT_ID`` the notifier's ``CHAT_ID`` attribute is
        set before all messages are sent; it keeps the last id afterwards.
        """
        records = property_data.to_dict('records')
        messages = [
            self.convert_property_dict_to_str(record) for record in records
        ]
        for chat_id in CHAT_ID:
            self.notifier.CHAT_ID = chat_id
            send = self.notifier.send_message
            for message in messages:
                send(message)

    def hse730_scrapping(self):
        """Run one scraping pass.

        Loads the stored check ids, loads the search pages, collects the
        property URLs not yet checked and stores their ids. It then fetches
        those property pages, cleans and filters the table, and sends the
        remaining rows out. The ids are stored before the property pages
        are fetched.
        """
        known_ids = self.get_check_id()
        search_pages = self.get_html_result_pages(self.get_search_url_list())
        new_urls = self.aggregate_property_url(search_pages, known_ids)
        self.append_new_check_id_list(new_urls, known_ids)
        property_pages = self.get_property_pages(new_urls)
        property_data = self.aggregate_property_table(property_pages)
        if len(property_data) == 0:
            return
        property_data = self.clean_filter_property_table(property_data)
        if len(property_data) == 0:
            return
        self.send_property_details(property_data)