def __init__(self, html=None, title=None, **kwargs):
    # Raw inputs; empty strings are normalized to None.
    self.html = html or None
    self.title = title or None
    # Extraction results, populated later by the parsing methods.
    self.entities = []
    self.keywords = []
    self.names = []
    self.fulltext = None
    self.language = None
    self.description = None
    self.canonical_url = None
    self.image = None
    self.published_date = None
    self.modified_date = None
    self.scraped_date = None
    self.contenthash = None
    self.reading_time = None
    # Goose extractor with image fetching disabled (text-only extraction).
    config = Configuration()
    config.enable_image_fetching = False
    self.goose = Goose(config=config)
    self.tree = None
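# A minimal, self-contained sketch of the extraction pattern set up in
# __init__ above: Goose with image fetching disabled, extracting from raw
# HTML. The sample HTML string is an assumption for illustration, not a
# value from the original code; requires `pip install goose3`.
from goose3 import Goose, Configuration

_config = Configuration()
_config.enable_image_fetching = False  # text-only extraction, no image downloads
_g = Goose(config=_config)
_article = _g.extract(raw_html='<html><head><title>Demo</title></head>'
                               '<body><p>Sample body text.</p></body></html>')
print(_article.cleaned_text)  # -> "Sample body text."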
import requests
from lxml import etree
from goose3 import Goose, Configuration
from goose3.text import StopWordsChinese

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}

# Fetch the listing page; `url` is assumed to be defined upstream of this
# snippet (it is not set anywhere in the code shown here).
res = requests.get(url=url, headers=header)
html = etree.HTML(res.text)
# Collect the article links from the post list.
a_list = html.xpath('//*[@id="J_posts_list"]/tr/td[2]/p[1]/a[3]/@href')
# print(len(a_list))

# Extract one article body with Goose, using the Chinese stop-word list.
uri = 'https://www.cifnews.com/article/30666'
g = Goose({'stopwords_class': StopWordsChinese})
article = g.extract(url=uri)
# NOTE: this Configuration is never passed to a Goose instance, so as
# written it has no effect on the extraction above.
config = Configuration()
config.enable_image_fetching = True
text = article.cleaned_text
# print(text)

# res = requests.get('https://waimaoquan.alibaba.com/bbs/attachment/1809/thread/309_713107_6a75458c47ea357.png', headers=header)
# with open('sds.jpg', 'wb') as j:
#     j.write(res.content)

# Target page to visit
targetUrl = "http://test.abuyun.com"
# Proxy server
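# The snippet cuts off after the "Proxy server" comment. A minimal sketch of
# how the proxy would typically be wired into requests for the targetUrl
# above; the host, port, and credentials below are placeholders (assumptions),
# not values from the original code.
proxyHost = "proxy.example.com"   # placeholder host
proxyPort = "9010"                # placeholder port
proxyUser = "USERNAME"            # placeholder credential
proxyPass = "PASSWORD"            # placeholder credential

proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
    "host": proxyHost,
    "port": proxyPort,
    "user": proxyUser,
    "pass": proxyPass,
}
proxies = {
    "http": proxyMeta,
    "https": proxyMeta,
}
resp = requests.get(targetUrl, headers=header, proxies=proxies)
print(resp.status_code)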