Example No. 1
    def _train_label_regressor(self):

        df_train = self._train_df_with_labels()

        df_train.dropna(subset=self.text_cols, inplace=True)

        logger.info(f'training with {len(df_train)} labels out of {len(self.labeler)} '
                    f'due to missing data (possibly due to date filtering)')

        if len(df_train) >= common.MLParams.min_training_samples:
            df_train = self._add_relevance_features(df_train)

            self.regressor = regression.LGBProbaRegressionPipeline(
                text_cols=self.text_cols, num_cols=self.num_cols_label)

            model_metrics, baselines_metrics = self.regressor.train_eval(
                df_train,
                y_col=self.target_col,
                target_name='label',
                baselines=[df_train[self.keyword_score_col],
                           df_train[self.scrape_order_rank_col]])

            metric = regression.MAIN_METRIC
            self.model_score = model_metrics[metric]
            self.keyword_score = baselines_metrics[0][metric]
            self.scrape_order_score = baselines_metrics[1][metric]

        else:
            logger.warning(f'Not training label regressor due to '
                           f'having only {len(df_train)} samples')
Example No. 2
def label_url_get(task_name, url):
    task = tasks[task_name]
    task.load_ranker()

    if task.ranker.busy:
        return _ranker_busy_page(task_name)

    if task.ranker_outdated():
        reload_url = flask.url_for('reload_ranker', task_name=task_name)
        logger.info(f'ranker outdated for "{task_name}" (new data scraped)')
        flask.flash(
            flask.Markup(
                f'New data scraped, "Reload" to update: '
                f'<a href="{reload_url}" class="alert-link">{reload_url}</a>'),
            'success')

    url_attributes, raw_description = task.ranker.url_data(url)
    url_att_html = (url_attributes.drop(['url', 'title']).to_frame().to_html(
        header=False, justify='right'))

    return flask.render_template('job_page.html',
                                 task_name=task_name,
                                 job_url=url,
                                 job_title=url_attributes.get('title'),
                                 job_description=raw_description,
                                 url_data=url_att_html)
Example No. 3
    def get_crawls(cls,
                   task_config: TaskConfig,
                   raise_on_missing=True,
                   ignore_empty=True,
                   filter_relevance_date=True,
                   ):
        all_crawls = [os.path.join(task_config.crawls_dir, f)
                      for f in sorted(os.listdir(task_config.crawls_dir))]

        if ignore_empty:
            all_crawls = [path for path in all_crawls if os.stat(path).st_size]

        if raise_on_missing and not all_crawls:
            raise FileNotFoundError(
                f'No crawls found for task "{task_config.name}", '
                f'please run scraping.')

        if filter_relevance_date and task_config.past_scrapes_relevance_date:
            filtered_crawls = cls._filter_recent(all_crawls, task_config=task_config)
            logger.info(f'got {len(filtered_crawls)} out of {len(all_crawls)} '
                        f'scrapes due to past_scrapes_relevance_date='
                        f'{task_config.past_scrapes_relevance_date}')
            return filtered_crawls
        else:
            return all_crawls
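
The _filter_recent helper referenced above is not part of this example. A minimal sketch of what it presumably does (keep only crawl files at least as recent as the configured relevance date, using file modification time) might look like the following; the use of modification time is an assumption, and it is written here as a plain function rather than the classmethod the original calls.

import os
import pandas as pd

def _filter_recent(crawl_paths, task_config):
    # hypothetical: keep crawl files modified on or after the relevance date
    cutoff = pd.Timestamp(task_config.past_scrapes_relevance_date)
    return [path for path in crawl_paths
            if pd.Timestamp(os.path.getmtime(path), unit='s') >= cutoff]
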
Example No. 4
    def print_top_n_features(self, x, y, n=30, target_name=''):
        # names of the top-n features by importance
        if not hasattr(self.reg, 'feature_importances_'):
            logger.error(
                f"regressor {self.reg} doesn't have 'feature_importances_' attribute"
            )
            return
        top_n_feat = np.argsort(self.reg.feature_importances_)[-n:]
        feat_names = self.transformer.get_feature_names()
        top_names = np.array(feat_names)[top_n_feat]

        # Spearman correlations of the top features with the target
        x = self.transformer.transform(x)
        top_feat_x = x[:, top_n_feat].toarray()
        cors_mat, _ = scipy.stats.spearmanr(top_feat_x, y.reshape(-1, 1))
        cors_vec = cors_mat[-1, 0:-1]
        non_zeros = top_feat_x.astype(bool).sum(0)
        df = pd.DataFrame(
            {'name': top_names,
             'correlation': cors_vec,
             'nonzeros': non_zeros}). \
            sort_values('correlation', ascending=False)

        logger.info(f'Top {n} informative features and correlations to '
                    f'{target_name}: \n{df}')
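
The slicing of cors_mat above relies on how scipy.stats.spearmanr stacks its two inputs: given an (m, k) feature matrix and an (m, 1) target it returns a (k+1) x (k+1) correlation matrix whose last row/column belongs to the target, so cors_mat[-1, 0:-1] holds the feature-to-target correlations. A small self-contained check (the data here is purely illustrative):

import numpy as np
import scipy.stats

rng = np.random.default_rng(0)
feats = rng.normal(size=(200, 3))              # 3 features
target = feats[:, 0] + rng.normal(size=200)    # correlated with feature 0

cors_mat, _ = scipy.stats.spearmanr(feats, target.reshape(-1, 1))
print(cors_mat.shape)       # (4, 4): 3 features + 1 target
print(cors_mat[-1, 0:-1])   # feature-to-target correlations
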
Example No. 5
    def exhaustive_column_selection(cls, text_cols, num_cols, x, y, metric,
                                    test_ratio):
        res = []

        x_train, x_test, y_train, y_test = train_test_split(
            x,
            y,
            test_size=test_ratio or common.MLParams.test_ratio,
            shuffle=common.MLParams.shuffle_split)

        for cols in all_subsets(text_cols + num_cols):
            this = cls([col for col in text_cols if col in cols],
                       [col for col in num_cols if col in cols])

            this.pipe.fit(x_train[list(cols)], y_train)
            y_pred = this.pipe.predict(x_test[list(cols)])
            test_metrics = score_metrics(y_test, y_pred)

            res.append((test_metrics[metric], cols))

            logger.info(
                f'selection {test_metrics} {(test_metrics[metric], cols)}')

        best_cols = sorted(res)[-1][1]
        logger.info(f'best: {best_cols}')

        return cls([col for col in text_cols if col in best_cols],
                   [col for col in num_cols if col in best_cols])
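
The all_subsets helper used in the loop above is not shown in this example; presumably it yields every non-empty combination of the candidate columns (2**n - 1 subsets, so the search cost grows exponentially with the number of columns). A hypothetical sketch:

import itertools

def all_subsets(items):
    # hypothetical: every non-empty combination of the given columns
    for r in range(1, len(items) + 1):
        yield from itertools.combinations(items, r)
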
Example No. 6
    def join(self):
        # wait for the crawl subprocess to finish and log its output
        out, err = self.subproc.communicate()
        out = out.strip().decode()
        err = err.strip().decode()
        if out:
            logger.info(f'crawl process stdout:\n{out}')
        if err:
            logger.info(f'crawl process stderr:\n{err}')
Example No. 7
    def add_label(self, url, label):
        if self.is_valid_label(label):
            self.load()
            # append the new label row (DataFrame.append and pd.datetime
            # are gone in recent pandas, so use pd.concat / pd.Timestamp)
            new_row = pd.DataFrame({self.url_col: [url],
                                    self.label_col: [label],
                                    self.timestamp_col: [str(pd.Timestamp.now())]})
            self._df = pd.concat([self._df, new_row])
            self.save()
            logger.info(f'Added label: {label} for {url}')
Example No. 8
    def print_metrics(self, y_test, y_pred, target_name):
        metrics = score_metrics(y_test, y_pred)
        logger.info(
            f"\n {pd.Series(metrics).to_frame(f'{target_name} :').transpose()}"
        )
        binary_metrics = binary_scores(y_test, y_pred)
        if binary_metrics is not None:
            logger.info(f"{target_name}, binary scores:\n {binary_metrics}")
        return metrics
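
score_metrics and binary_scores are project helpers not shown here. Judging by their usage (a metrics dict that can be wrapped in a pd.Series, and binary scores that may be None), they could be sketched roughly as follows; the specific metrics chosen are assumptions:

import numpy as np
import pandas as pd
from sklearn import metrics as skm

def score_metrics(y_true, y_pred):
    # hypothetical regression metrics
    return {'r2': skm.r2_score(y_true, y_pred),
            'mae': skm.mean_absolute_error(y_true, y_pred)}

def binary_scores(y_true, y_pred, threshold=0.5):
    # hypothetical: only meaningful when the target is 0/1
    if set(np.unique(y_true)) - {0, 1}:
        return None
    y_bin = (np.asarray(y_pred) >= threshold).astype(int)
    return pd.Series({'precision': skm.precision_score(y_true, y_bin),
                      'recall': skm.recall_score(y_true, y_bin)})
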
Example No. 9
    def _calc_duplicates(self):

        df_all = self.df_all_read

        keep_inds, dup_dict_inds = calc_duplicates(
            df_all[self.description_col], keep='last')

        urls = df_all['url'].values
        self.dup_dict = {urls[i]: urls[sorted([i] + list(dups))]
                         for i, dups in dup_dict_inds.items()}

        # dedup by content and keep last
        self.df_all_deduped = df_all.iloc[keep_inds]

        logger.info(f'total historic jobs DF: {len(self.df_all_deduped)} '
                    f'(deduped from {len(df_all)})')
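
calc_duplicates is another helper not included in this example. From its usage it takes the description column and returns positional indices to keep (the last occurrence of each duplicated text) plus a mapping from each kept index to the indices of its duplicates; a hypothetical sketch:

def calc_duplicates(texts, keep='last'):
    # hypothetical content-based dedup: group rows by identical text
    groups = {}
    for i, text in enumerate(texts):
        groups.setdefault(text, []).append(i)

    keep_inds, dup_dict_inds = [], {}
    for inds in groups.values():
        kept = inds[-1] if keep == 'last' else inds[0]
        keep_inds.append(kept)
        if len(inds) > 1:
            dup_dict_inds[kept] = [i for i in inds if i != kept]
    return sorted(keep_inds), dup_dict_inds
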
Example No. 10
    def _sort_jobs(self, df):
        sort_cols = [self.scrape_order_rank_col,
                     self.keyword_score_col,
                     self.model_score_col]
        scores = [self.scrape_order_score,
                  self.keyword_score,
                  self.model_score]

        if not any(scores):  # didn't train, choose default
            self.sort_col = (self.keyword_score_col if self.task_config.has_keywords()
                             else self.scrape_order_rank_col)
        else:
            self.sort_col = sort_cols[int(np.nanargmax(scores))]

        logger.info(f'Sorting by column: {self.sort_col} ({self.ranking_scores})')
        df.sort_values(self.sort_col, ascending=False, inplace=True)
        return df
Example No. 11
    def _set_df_recent(self):
        self.recent_crawl_source = CrawlsFilesDao.get_crawls(
            task_config=self.task_config)[-1]
        recent_full_df = CrawlsFilesDao.read_scrapy_file(self.recent_crawl_source)
        if not self.dedup_recent:
            self.df_recent = recent_full_df
            self._add_duplicates_column()
        else:
            self.df_recent = (
                self.df_all_deduped.loc[self.df_all_deduped['scraped_file'] ==
                                        self.recent_crawl_source, :])
            unlabeled = [u for u in self.df_recent['url']
                         if not self.labeler.is_labeled(u)]
            self.df_recent = self.df_recent[self.df_recent['url'].isin(unlabeled)]

        logger.info(f'most recent scrape DF: '
                    f'{len(self.df_recent)} ({self.recent_crawl_source}, '
                    f'all scraped: {len(recent_full_df)})')
Example No. 12
        def fit(self, X, y, early_stopping=False):
            if early_stopping:
                x_train, x_valid, y_train, y_valid = train_test_split(
                    X, y, test_size=0.3, shuffle=common.MLParams.shuffle_split)

                LGBMRegressor.fit(self,
                                  x_train,
                                  y_train,
                                  eval_metric=self.eval_metric,
                                  early_stopping_rounds=200,
                                  eval_set=(x_valid, y_valid),
                                  verbose=False)

                logger.info(
                    f'LGBM early stopping: '
                    f'setting n_estimators to best_iteration_({self.best_iteration_})'
                )
                self.n_estimators = self.best_iteration_
            return LGBMRegressor.fit(self, X, y)
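
Note that early_stopping_rounds and verbose are accepted by fit() only in older LightGBM releases; LightGBM 4.x dropped them in favour of callbacks. A self-contained sketch of the equivalent call on a recent version (the dataset and parameters below are purely illustrative):

import lightgbm as lgb
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=500, n_features=20, random_state=0)
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3)

reg = lgb.LGBMRegressor(n_estimators=2000)
reg.fit(x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        eval_metric='l2',
        callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)])
print(reg.best_iteration_)   # iteration chosen by early stopping
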
Example No. 13
    def _train_salary_regressor(self):
        df_train = self.df_all_deduped.copy()
        df_train = self._add_salary_features(df_train)

        target_col = 'salary_high'

        df_train.dropna(subset=self.text_cols + [target_col], inplace=True)
        logger.info(f'training with {len(df_train)} salaries')

        if len(df_train) >= common.MLParams.min_training_samples:
            self.regressor_salary = regression.LGBRegressionPipeline(
                text_cols=self.text_cols, num_cols=self.num_cols_salary)
            model_metrics, _ = self.regressor_salary.train_eval(
                df_train, y_col=target_col, target_name='salary')

            self.reg_sal_model_score = model_metrics[regression.MAIN_METRIC]

        else:
            logger.warning(f'Not training salary regressor due to '
                           f'having only {len(df_train)} samples')
Example No. 14
    def start(self):
        joined_start_urls = ','.join(self.task_config.search_urls)

        commands = [
            'scrapy', 'crawl', 'jora-spider',
            '-a', f'start_urls="{joined_start_urls}"']
        for k, v in self._settings_dict().items():
            commands.extend(['-s', f'{k}="{v}"'])

        scrapy_dir = os.path.dirname(__file__)

        logger.info(f"launching scrapy in dir {scrapy_dir} with:\n\t{' '.join(commands)}")

        self.subproc = subprocess.Popen(
            ' '.join(commands),
            shell=True,
            cwd=scrapy_dir,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        logger.info(f'Started scraping task "{self.task_config.name}".\n\t'
                    f'check log file at {self.log_path}\n\t'
                    f'output file at {self.crawl_output_path}')
Example No. 15
def reload_ranker(task_name):
    task = tasks[task_name]
    if task.crawling:
        logger.info(f'not reloading ranker data because '
                    f'scraping is in progress: {task_name}')
        flask.flash('Scraping in progress, not reloading data.', 'warning')
    elif task.ranker.busy:
        logger.info(
            f'not reloading ranker data because '
            f'ranker is busy (reloading or recalculating): {task_name}')
        flask.flash('Ranker is busy, not reloading data.', 'warning')
    else:
        task.reload_ranker()
        logger.info(f'reloading ranker: {task_name}')
        flask.flash(f're-loading data for task "{task_name}"', 'info')
    return flask.redirect(
        flask.url_for('task_description', task_name=task_name))
Example No. 16
    def end_labeling_message(message):
        logger.info(message)
Example No. 17
def skip_url(task_name, url):
    tasks[task_name].skip(url)
    logger.info(f'skip: {url} for "{task_name}"')
    flask.flash(f'skipped url {url} for "{task_name}"', 'warning')
    return flask.redirect(flask.url_for('labeling', task_name=task_name))
Example No. 18
def recalc(task_name):
    tasks[task_name].recalc()
    logger.info(f'recalculating: {task_name}')
    flask.flash(f're-calculating rankings for task "{task_name}"', 'info')
    return flask.redirect(flask.url_for('labeling', task_name=task_name))
Example No. 19
def scrape_start(task_name):
    tasks[task_name].start_crawl()
    logger.info(f'started scraping for {task_name}')
    flask.flash(f'Started scraping for task "{task_name}"', 'success')
    return flask.redirect(flask.url_for('scraping', task_name=task_name))