def __process_goose(self):
    """Extract the article at ``self.url`` with Goose.

    Image fetching is enabled for the extraction. When the extracted
    article has a top image with a non-empty ``src``, ``self.images`` is
    set to the result of ``self.get_all_images_from_example_src(src)``;
    otherwise ``self.images`` is left untouched.

    Returns:
        The article object returned by ``Goose.extract``, or ``None`` when
        a ``requests`` connection error or read timeout is raised.
    """
    goose_config = Configuration()
    goose_config.browser_user_agent = 'Mozilla 5.0'
    goose_config.enable_image_fetching = True

    # Use Goose as a context manager so it is closed on every exit path
    # (normal return, handled network error, or unexpected exception).
    with Goose(config=goose_config) as g:
        try:
            article = g.extract(self.url)
            # top_image can be None when no image was found for the page,
            # so read ``src`` defensively instead of dereferencing it.
            top_image_src = getattr(article.top_image, 'src', None)
            if top_image_src:
                self.images = self.get_all_images_from_example_src(
                    top_image_src)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.ReadTimeout):
            # Best-effort: network failures are reported as "no article".
            return None
    return article
import pickle from sklearn.model_selection import train_test_split from algorithm.data_preprocess_main import FETCH_DATA, FEATURE_EXTRACTION, PREPROCESS from Utility import Utility_func import sys import os from algorithm.predictive_model import Classify import json import pandas as pd from goose3 import Goose from goose3.configuration import Configuration config = Configuration() config.strict = False # turn of strict exception handling config.browser_user_agent = 'Mozilla 5.0' # set the browser agent string config.http_timeout = 5.05 # set http timeout in seconds with Goose(config) as g: pass def getContent(url): try: content = g.extract(url) rs = { "url": url, "type": None, "title": content.title, "description": content.meta_description } except: