Example #1
    # Requires: re, goose3 (Goose, Configuration) and goose3.text.StopWords
    def parse_news_text(self, page_html: str, url: str) -> dict:
        # Lazily create and cache a single Goose extractor on first use
        if self._extractor is None:
            config = Configuration()
            config.stopwords_class = StopWords
            config.strict = False  # don't raise on recoverable parsing errors

            self._extractor = Goose(config)
        # Extract the article body from the raw HTML and collapse whitespace
        article = self._extractor.extract(raw_html=page_html)
        news_text = re.sub(r'\s+', ' ', article.cleaned_text)
        return {'url': url, 'text': news_text}
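The method above builds one Goose extractor, reuses it for every call, and returns the cleaned article body with whitespace collapsed. A minimal usage sketch, assuming a hypothetical NewsParser class that owns parse_news_text and using requests (not part of the original snippet) to fetch the page HTML:

import requests

parser = NewsParser()  # hypothetical class exposing parse_news_text from Example #1
url = 'https://example.com/some-article'
page_html = requests.get(url, timeout=10).text
result = parser.parse_news_text(page_html, url)
print(result['url'], result['text'][:200])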
Example #2
import pickle
from sklearn.model_selection import train_test_split
from algorithm.data_preprocess_main import FETCH_DATA, FEATURE_EXTRACTION, PREPROCESS
from Utility import Utility_func
import sys
import os

from algorithm.predictive_model import Classify
import json
import pandas as pd
from goose3 import Goose
from goose3.configuration import Configuration

config = Configuration()
config.strict = False  # turn off strict exception handling
config.browser_user_agent = 'Mozilla 5.0'  # set the browser user-agent string
config.http_timeout = 5.05  # set the HTTP timeout in seconds
# Keep the extractor open for later use; a `with` block would close it before getContent runs
g = Goose(config)


def getContent(url):
    # Fetch the page at `url` with goose3 and collect its basic metadata
    try:
        content = g.extract(url)
        rs = {
            "url": url,
            "type": None,
            "title": content.title,
            "description": content.meta_description
        }