def parse_news_text(self, page_html: str, url: str) -> dict:
    """Extract the main article body from raw HTML and normalise its whitespace.

    On first use, builds a non-strict Goose extractor (configured with the
    project's StopWords class) and caches it on ``self._extractor`` so
    subsequent calls reuse the same instance.

    :param page_html: raw HTML of the news page to parse
    :param url: source URL, echoed back unchanged in the result
    :return: dict with keys ``'url'`` and ``'text'`` (cleaned article text)
    """
    if self._extractor is None:
        # Lazy one-time initialisation of the cached extractor.
        goose_config = Configuration()
        goose_config.stopwords_class = StopWords
        goose_config.strict = False
        self._extractor = Goose(goose_config)
    extracted = self._extractor.extract(raw_html=page_html)
    # Collapse every run of whitespace (newlines, tabs, multiple spaces)
    # into a single space so the text is one flat string.
    return {'url': url, 'text': re.sub(r'\s+', r' ', extracted.cleaned_text)}
import pickle from sklearn.model_selection import train_test_split from algorithm.data_preprocess_main import FETCH_DATA, FEATURE_EXTRACTION, PREPROCESS from Utility import Utility_func import sys import os from algorithm.predictive_model import Classify import json import pandas as pd from goose3 import Goose from goose3.configuration import Configuration config = Configuration() config.strict = False # turn of strict exception handling config.browser_user_agent = 'Mozilla 5.0' # set the browser agent string config.http_timeout = 5.05 # set http timeout in seconds with Goose(config) as g: pass def getContent(url): try: content = g.extract(url) rs = { "url": url, "type": None, "title": content.title, "description": content.meta_description }