def _train_label_regressor(self):
    df_train = self._train_df_with_labels()
    df_train.dropna(subset=self.text_cols, inplace=True)
    logger.info(f'training with {len(df_train)} labels out of {self.labeler} '
                f'due to missing data (possibly due to date filtering)')
    if len(df_train) >= common.MLParams.min_training_samples:
        df_train = self._add_relevance_features(df_train)
        self.regressor = regression.LGBProbaRegressionPipeline(
            text_cols=self.text_cols, num_cols=self.num_cols_label)
        model_metrics, baselines_metrics = self.regressor.train_eval(
            df_train,
            y_col=self.target_col,
            target_name='label',
            baselines=[df_train[self.keyword_score_col],
                       df_train[self.scrape_order_rank_col]])
        metric = regression.MAIN_METRIC
        self.model_score = model_metrics[metric]
        self.keyword_score = baselines_metrics[0][metric]
        self.scrape_order_score = baselines_metrics[1][metric]
    else:
        logger.warning(f'Not training label regressor due to '
                       f'having only {len(df_train)} samples')
def label_url_get(task_name, url):
    task = tasks[task_name]
    task.load_ranker()
    if task.ranker.busy:
        return _ranker_busy_page(task_name)
    if task.ranker_outdated():
        reload_url = flask.url_for('reload_ranker', task_name=task_name)
        logger.info(f'ranker outdated for "{task_name}" (new data scraped)')
        flask.flash(
            flask.Markup(
                f'New data scraped, "Reload" to update: '
                f'<a href="{reload_url}" class="alert-link">{reload_url}</a>'),
            'success')
    url_attributes, raw_description = task.ranker.url_data(url)
    url_att_html = (url_attributes.drop(['url', 'title']).to_frame().to_html(
        header=False, justify='right'))
    return flask.render_template('job_page.html',
                                 task_name=task_name,
                                 job_url=url,
                                 job_title=url_attributes.get('title'),
                                 job_description=raw_description,
                                 url_data=url_att_html)
def get_crawls(cls,
               task_config: TaskConfig,
               raise_on_missing=True,
               ignore_empty=True,
               filter_relevance_date=True,
               ):
    all_crawls = [os.path.join(task_config.crawls_dir, f)
                  for f in sorted(os.listdir(task_config.crawls_dir))]
    if ignore_empty:
        all_crawls = [path for path in all_crawls if os.stat(path).st_size]
    if raise_on_missing and not all_crawls:
        raise FileNotFoundError(
            f'No crawls found for task "{task_config.name}", '
            f'please run scraping.')
    if filter_relevance_date and task_config.past_scrapes_relevance_date:
        filtered_crawls = cls._filter_recent(all_crawls,
                                             task_config=task_config)
        logger.info(f'got {len(filtered_crawls)} out of {len(all_crawls)} '
                    f'scrapes due to past_scrapes_relevance_date='
                    f'{task_config.past_scrapes_relevance_date}')
        return filtered_crawls
    else:
        return all_crawls
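# A hedged sketch of the `cls._filter_recent` helper used above, assuming it keeps only
# crawl files modified on or after task_config.past_scrapes_relevance_date (the real
# implementation may instead parse dates out of the filenames). Relies on `os` and
# `pandas as pd` already being imported in this module.
@classmethod
def _filter_recent(cls, crawl_paths, task_config):
    cutoff = pd.Timestamp(task_config.past_scrapes_relevance_date)
    return [path for path in crawl_paths
            if pd.Timestamp(os.path.getmtime(path), unit='s') >= cutoff]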
def print_top_n_features(self, x, y, n=30, target_name=''):
    # names
    if not hasattr(self.reg, 'feature_importances_'):
        logger.error(
            f"regressor {self.reg} doesn't have "
            f"'feature_importances_' attribute")
        return
    top_n_feat = np.argsort(self.reg.feature_importances_)[-n:]
    feat_names = self.transformer.get_feature_names()
    top_names = np.array(feat_names)[top_n_feat]

    # correlations
    x = self.transformer.transform(x)
    top_feat_x = x[:, top_n_feat].toarray()
    cors_mat, _ = scipy.stats.spearmanr(top_feat_x, y.reshape(-1, 1))
    cors_vec = cors_mat[-1, 0:-1]
    non_zeros = top_feat_x.astype(bool).sum(0)
    df = pd.DataFrame({'name': top_names,
                       'correlation': cors_vec,
                       'nonzeros': non_zeros}
                      ).sort_values('correlation', ascending=False)
    logger.info(f'Top {n} informative features and correlations to '
                f'{target_name}: \n{df}')
def exhaustive_column_selection(cls, text_cols, num_cols, x, y, metric, test_ratio):
    res = []
    x_train, x_test, y_train, y_test = train_test_split(
        x, y,
        test_size=test_ratio or common.MLParams.test_ratio,
        shuffle=common.MLParams.shuffle_split)
    for cols in all_subsets(text_cols + num_cols):
        this = cls([col for col in text_cols if col in cols],
                   [col for col in num_cols if col in cols])
        this.pipe.fit(x_train[list(cols)], y_train)
        y_pred = this.pipe.predict(x_test[list(cols)])
        test_metrics = score_metrics(y_test, y_pred)
        res.append((test_metrics[metric], cols))
        logger.info(f'selection {test_metrics} {(test_metrics[metric], cols)}')
    best_cols = sorted(res)[-1][1]
    logger.info(f'best: {best_cols}')
    return cls([col for col in text_cols if col in best_cols],
               [col for col in num_cols if col in best_cols])
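# A hedged sketch of the `all_subsets` helper iterated over above, assuming it yields
# every non-empty subset of the candidate columns (the real helper may cap subset size).
import itertools

def all_subsets(items):
    for size in range(1, len(items) + 1):
        yield from itertools.combinations(items, size)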
def join(self):
    out, err = self.subproc.communicate()
    out = out.strip().decode()
    err = err.strip().decode()
    if out:
        logger.info(f'crawl process stdout:\n{out}')
    if err:
        logger.info(f'crawl process stderr:\n{err}')
def add_label(self, url, label):
    if self.is_valid_label(label):
        self.load()
        new_row = pd.DataFrame({self.url_col: [url],
                                self.label_col: [label],
                                self.timestamp_col: [str(pd.Timestamp.now())]})
        self._df = pd.concat([self._df, new_row])
        self.save()
        logger.info(f'Added label: {label} for {url}')
def print_metrics(self, y_test, y_pred, target_name):
    metrics = score_metrics(y_test, y_pred)
    logger.info(
        f"\n {pd.Series(metrics).to_frame(f'{target_name} :').transpose()}")
    binary_metrics = binary_scores(y_test, y_pred)
    if binary_metrics is not None:
        logger.info(f"{target_name}, binary scores:\n {binary_metrics}")
    return metrics
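# A hedged sketch of the `score_metrics` helper, assuming it returns a dict of standard
# regression metrics keyed by name (the real helper may compute a different set, and
# regression.MAIN_METRIC would then be one of these keys).
from sklearn import metrics as sk_metrics

def score_metrics(y_true, y_pred):
    return {
        'r2': sk_metrics.r2_score(y_true, y_pred),
        'mae': sk_metrics.mean_absolute_error(y_true, y_pred),
        'rmse': sk_metrics.mean_squared_error(y_true, y_pred) ** 0.5,
    }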
def _calc_duplicates(self):
    df_all = self.df_all_read
    keep_inds, dup_dict_inds = calc_duplicates(
        df_all[self.description_col], keep='last')
    urls = df_all['url'].values
    self.dup_dict = {urls[i]: urls[sorted([i] + list(dups))]
                     for i, dups in dup_dict_inds.items()}
    # dedup by content and keep last
    self.df_all_deduped = df_all.iloc[keep_inds]
    logger.info(f'total historic jobs DF: {len(self.df_all_deduped)} '
                f'(deduped from {len(df_all)})')
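# A hedged sketch of the `calc_duplicates` helper used above, assuming it groups rows by
# identical text and returns (positional indices to keep, {kept index: duplicate indices});
# the real implementation may use fuzzier matching than exact equality.
def calc_duplicates(texts, keep='last'):
    positions = {}
    for i, text in enumerate(texts):
        positions.setdefault(text, []).append(i)
    keep_inds, dup_dict = [], {}
    for inds in positions.values():
        kept = inds[-1] if keep == 'last' else inds[0]
        keep_inds.append(kept)
        if len(inds) > 1:
            dup_dict[kept] = [i for i in inds if i != kept]
    return sorted(keep_inds), dup_dict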
def _sort_jobs(self, df):
    sort_cols = [self.scrape_order_rank_col,
                 self.keyword_score_col,
                 self.model_score_col]
    scores = [self.scrape_order_score, self.keyword_score, self.model_score]
    if not any(scores):  # didn't train, choose default
        self.sort_col = (self.keyword_score_col
                         if self.task_config.has_keywords()
                         else self.scrape_order_rank_col)
    else:
        self.sort_col = sort_cols[int(np.nanargmax(scores))]
    logger.info(f'Sorting by column: {self.sort_col} ({self.ranking_scores})')
    df.sort_values(self.sort_col, ascending=False, inplace=True)
    return df
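# Worked example (hedged, made-up values): with scores = [0.2, np.nan, 0.45] the
# np.nanargmax call above ignores the NaN and returns 2, so sorting would fall to
# self.model_score_col; only if all three scores are falsy does the default branch apply.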
def _set_df_recent(self):
    self.recent_crawl_source = CrawlsFilesDao.get_crawls(
        task_config=self.task_config)[-1]
    recent_full_df = CrawlsFilesDao.read_scrapy_file(self.recent_crawl_source)
    if not self.dedup_recent:
        self.df_recent = recent_full_df
        self._add_duplicates_column()
    else:
        self.df_recent = (
            self.df_all_deduped.loc[
                self.df_all_deduped['scraped_file'] ==
                self.recent_crawl_source, :])
    unlabeled = [u for u in self.df_recent['url']
                 if not self.labeler.is_labeled(u)]
    self.df_recent = self.df_recent[self.df_recent['url'].isin(unlabeled)]
    logger.info(f'most recent scrape DF: '
                f'{len(self.df_recent)} ({self.recent_crawl_source}, '
                f'all scraped: {len(recent_full_df)})')
def fit(self, X, y, early_stopping=False):
    if early_stopping:
        x_train, x_valid, y_train, y_valid = train_test_split(
            X, y, test_size=0.3, shuffle=common.MLParams.shuffle_split)
        LGBMRegressor.fit(self, x_train, y_train,
                          eval_metric=self.eval_metric,
                          early_stopping_rounds=200,
                          eval_set=(x_valid, y_valid),
                          verbose=False)
        logger.info(f'LGBM early stopping: setting n_estimators '
                    f'to best_iteration_ ({self.best_iteration_})')
        self.n_estimators = self.best_iteration_
    return LGBMRegressor.fit(self, X, y)
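# Note (hedged): the two-stage fit above spends a validation split only to find
# best_iteration_, then refits on all rows with n_estimators pinned to it. Also,
# lightgbm 4.x removed the early_stopping_rounds and verbose keyword arguments from the
# sklearn-style fit(); if the project upgrades, the early-stopping branch could be
# expressed with callbacks instead, roughly:
#
#   import lightgbm as lgb
#   LGBMRegressor.fit(self, x_train, y_train,
#                     eval_metric=self.eval_metric,
#                     eval_set=[(x_valid, y_valid)],
#                     callbacks=[lgb.early_stopping(stopping_rounds=200),
#                                lgb.log_evaluation(period=0)])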
def _train_salary_regressor(self):
    df_train = self.df_all_deduped.copy()
    df_train = self._add_salary_features(df_train)
    target_col = 'salary_high'
    df_train.dropna(subset=self.text_cols + [target_col], inplace=True)
    logger.info(f'training with {len(df_train)} salaries')
    if len(df_train) >= common.MLParams.min_training_samples:
        self.regressor_salary = regression.LGBRegressionPipeline(
            text_cols=self.text_cols, num_cols=self.num_cols_salary)
        model_metrics, _ = self.regressor_salary.train_eval(
            df_train, y_col=target_col, target_name='salary')
        self.reg_sal_model_score = model_metrics[regression.MAIN_METRIC]
    else:
        logger.warning(f'Not training salary regressor due to '
                       f'having only {len(df_train)} samples')
def start(self):
    joined_start_urls = ','.join(self.task_config.search_urls)
    commands = [
        'scrapy', 'crawl', 'jora-spider',
        '-a', f'start_urls="{joined_start_urls}"']
    for k, v in self._settings_dict().items():
        commands.extend(['-s', f'{k}="{v}"'])
    scrapy_dir = os.path.dirname(__file__)
    logger.info(f"launching scrapy in dir {scrapy_dir} "
                f"with:\n\t{' '.join(commands)}")
    self.subproc = subprocess.Popen(
        ' '.join(commands),
        shell=True,
        cwd=scrapy_dir,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    logger.info(f'Started scraping task "{self.task_config.name}".\n\t'
                f'check log file at {self.log_path}\n\t'
                f'output file at {self.crawl_output_path}')
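# For illustration (hedged, made-up values): if _settings_dict() returned
# {'LOG_FILE': 'crawls/task.log', 'FEED_URI': 'crawls/task.jl'}, the shell command
# composed and launched above would look like:
#   scrapy crawl jora-spider -a start_urls="https://..." \
#       -s LOG_FILE="crawls/task.log" -s FEED_URI="crawls/task.jl"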
def reload_ranker(task_name):
    task = tasks[task_name]
    if task.crawling:
        logger.info(f'not reloading ranker data because '
                    f'scraping is in progress: {task_name}')
        flask.flash(f'Scraping in progress, not reloading data.', 'warning')
    elif task.ranker.busy:
        logger.info(f'not reloading ranker data because '
                    f'ranker is busy (reloading or recalculating): {task_name}')
        flask.flash(f'Ranker is busy, not reloading data.', 'warning')
    else:
        task.reload_ranker()
        logger.info(f'reloading ranker: {task_name}')
        flask.flash(f're-loading data for task "{task_name}"', 'info')
    return flask.redirect(
        flask.url_for('task_description', task_name=task_name))
def end_labeling_message(message):
    logger.info(message)
def skip_url(task_name, url):
    tasks[task_name].skip(url)
    logger.info(f'skip: {url} for "{task_name}"')
    flask.flash(f'skipped url {url} for "{task_name}"', 'warning')
    return flask.redirect(flask.url_for('labeling', task_name=task_name))
def recalc(task_name):
    tasks[task_name].recalc()
    logger.info(f'recalculating: {task_name}')
    flask.flash(f're-calculating rankings for task "{task_name}"', 'info')
    return flask.redirect(flask.url_for('labeling', task_name=task_name))
def scrape_start(task_name):
    tasks[task_name].start_crawl()
    logger.info(f'started scraping for {task_name}')
    flask.flash(f'Started scraping for task "{task_name}"', 'success')
    return flask.redirect(flask.url_for('scraping', task_name=task_name))