示例#1
0
    def __init__(self, html=None, title=" ", **kwargs):
        """Initialise an empty article record and its Goose extractor.

        Args:
            html: Raw HTML of the page; any falsy value is stored as None.
            title: Page title; any falsy value is stored as None.
            **kwargs: Accepted for call compatibility; not read here.
        """
        # Normalise falsy inputs (None, empty string, ...) to None.
        self.html = html if html else None
        self.title = title if title else None

        # Collections start out empty; each attribute gets its own list.
        self.entities, self.keywords, self.names = [], [], []

        # Scalar fields are unset until something populates them.
        for unset_field in (
            "fulltext",
            "language",
            "description",
            "canonical_url",
            "image",
            "published_date",
            "modified_date",
            "scraped_date",
            "contenthash",
            "reading_time",
        ):
            setattr(self, unset_field, None)

        # Build the extractor with image fetching switched off.
        goose_config = Configuration()
        goose_config.enable_image_fetching = False
        self.goose = Goose(config=goose_config)

        # No parsed tree is held at construction time.
        self.tree = None
示例#2
0
# Fetch the listing page.
# NOTE(review): `url` is not defined in the visible part of this script, and
# `header` is only assigned further down — confirm both are set earlier.
res = requests.get(url=url, headers=header)

from lxml import etree

# Parse the response body into an lxml element tree.
html = etree.HTML(res.text)

# Collect the href of the third <a> inside the first <p> of the second <td>
# of every row under the element whose id is "J_posts_list".
a_list = html.xpath('//*[@id="J_posts_list"]/tr/td[2]/p[1]/a[3]/@href')

# print(len(a_list))

# Article URL handed to Goose for extraction.
uri = 'https://www.cifnews.com/article/30666'

# Extractor configured with the Chinese stop-word class.
g = Goose({'stopwords_class': StopWordsChinese})
article = g.extract(url=uri)
# NOTE(review): this Configuration is created after the extraction above and
# is not passed to Goose anywhere in the visible code, so it appears to have
# no effect here — confirm whether it is used later or should configure `g`.
config = Configuration()
config.enable_image_fetching = True
# Cleaned body text of the extracted article.
text = article.cleaned_text
# print(text)

# Request headers carrying a desktop Chrome User-Agent string.
header = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
# Disabled example: download an image and write its bytes to sds.jpg.
# res = requests.get('https://waimaoquan.alibaba.com/bbs/attachment/1809/thread/309_713107_6a75458c47ea357.png', headers=header)
# with open('sds.jpg', 'wb') as j:
#     j.write(res.content)

# Target page to request
targetUrl = "http://test.abuyun.com"

# Proxy server