def get_news_result_cnt(self, news_url, keyword=''):
    # Route goose3 through the instance proxy, with the instance UA and
    # Chinese stop words for body-text scoring.
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = self.ua
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    # Fall back to a site-specific XPath when goose cannot clean the body.
    if article.cleaned_text:
        cont = article.cleaned_text
    else:
        text_tree = etree.HTML(article.raw_html)
        cont = ''.join(text_tree.xpath(
            '//div[@class="col-md-10 col-xs-12 detailNews"]/p//text()'
        )).replace('\xa0', '')
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword=keyword,
        url=article.final_url,
        title=article.title,
        platform='news',
        content=cont,
        author=article.authors,
        source=self.source,
        published_time=int(parse(article.publish_date).timestamp() * 1000)
        if article.publish_date else None,
        spi_time=int(time.time() * 1000),
    )
    return news_post

def get_news_result_cnt(self, news_url):
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/83.0.4103.116 Safari/537.36')
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    # dateutil handles most formats; fall back to the Chinese
    # "2020年01月02日" layout that it cannot parse.
    try:
        published_time = (int(parse(article.publish_date).timestamp() * 1000)
                          if article.publish_date else None)
    except (ValueError, OverflowError):
        published_time = int(time.mktime(time.strptime(
            article.publish_date, '%Y年%m月%d日')) * 1000)
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword='',
        url=article.final_url,
        title=article.title,
        platform='news',
        content=article.cleaned_text,
        author=article.authors,
        source=self.source,
        published_time=published_time,
        spi_time=int(time.time() * 1000),
    )
    return news_post

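# A minimal standalone sketch of the two-stage date parsing above, assuming
# `parse` is dateutil's parser as in the snippets here; `to_millis` is a
# hypothetical helper name, not part of the original code.
import time
from dateutil.parser import parse

def to_millis(raw_date):
    if not raw_date:
        return None
    try:
        # dateutil handles ISO dates, RFC 2822 timestamps, and similar.
        return int(parse(raw_date).timestamp() * 1000)
    except (ValueError, OverflowError):
        # Chinese-style dates such as "2020年07月01日" need an explicit format.
        return int(time.mktime(time.strptime(raw_date, '%Y年%m月%d日')) * 1000)

print(to_millis('2020-07-01'))      # epoch milliseconds
print(to_millis('2020年07月01日'))  # same day via the strptime fallback
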
def __init__(self, html=None, title=" ", **kwargs):
    self.html = html or None
    self.title = title or None
    self.entities = []
    self.keywords = []
    self.names = []
    self.fulltext = None
    self.language = None
    self.description = None
    self.canonical_url = None
    self.image = None
    self.published_date = None
    self.modified_date = None
    self.scraped_date = None
    self.contenthash = None
    self.reading_time = None
    # Skip image fetching: only the text fields above are populated here.
    config = Configuration()
    config.enable_image_fetching = False
    self.goose = Goose(config=config)
    self.tree = None

def get_news_result_cnt(self, news_url):
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = self.ua
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    # Only the cleaned body text is needed here.
    news_post = dict(content=article.cleaned_text)
    return news_post

def get_news_result_cnt(self, news_url):
    head = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.9',
    }
    proxy = {'http': self.proxy, 'https': self.proxy}
    req = requests.get(news_url, proxies=proxy, headers=head)
    # Pull the raw body out of the embedded JSON, then strip markup noise.
    cnt = ''.join(re.findall(r'content":"(.*?)"}', req.text, re.S) or '')
    for junk in ('<br>', '\xa0', '<br />', ' ',
                 '</strong>', '<strong>', '<u>', '</u>'):
        cnt = cnt.replace(junk, '')
    # Truncate at embedded widgets that mark the end of the article body.
    if '<iframe' in cnt:
        cnt = ''.join(re.findall(r'(.*?)<iframe', cnt, re.S))
    if '<div' in cnt:
        cnt = re.findall(r'(.*?)<div', cnt, re.S)[0]
    if 'allow=' in cnt:
        cnt = re.findall(r'(.*?)allow=', cnt, re.S)[0]
    config = Configuration()
    config.http_proxies = proxy
    config.browser_user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/83.0.4103.116 Safari/537.36')
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword='',
        url=article.final_url,
        title=article.title,
        platform='news',
        content=cnt,
        author=article.authors,
        source=self.source,
        published_time=int(parse(article.publish_date).timestamp() * 1000)
        if article.publish_date else None,
        spi_time=int(time.time() * 1000),
    )
    return news_post

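# A tiny standalone check of the truncation logic above on synthetic input;
# the sample string is made up for illustration.
import re

cnt = '正文一<br>正文二<iframe src="x"></iframe>尾部'
for junk in ('<br>', '\xa0', '<br />', ' '):
    cnt = cnt.replace(junk, '')
if '<iframe' in cnt:
    # Keep only the text segments before each embedded iframe.
    cnt = ''.join(re.findall(r'(.*?)<iframe', cnt, re.S))
print(cnt)  # -> 正文一正文二
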
def auto_news_main_content(self, news_url, keyword=''):
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = self.ua
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword=keyword,
        url=article.final_url,
        title=article.title,
        platform='news',
        content=article.cleaned_text,
        author=article.authors,
        # Fall back to the article's domain when no source is configured.
        source=self.source if self.source else article.domain,
        published_time=int(parse(article.publish_date).timestamp() * 1000)
        if article.publish_date else None,
        spi_time=int(time.time() * 1000),
    )
    return news_post

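# Standalone sketch of the goose3 setup the extractor methods above share;
# the proxy address and target URL are placeholders, not values from this
# code.
from goose3 import Goose, Configuration
from goose3.text import StopWordsChinese

config = Configuration()
config.http_proxies = {'http': 'http://127.0.0.1:8888',
                       'https': 'http://127.0.0.1:8888'}
config.browser_user_agent = 'Mozilla/5.0'
config.stopwords_class = StopWordsChinese
article = Goose(config).extract('https://example.com/news/1')
print(article.title, len(article.cleaned_text))
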
import os
import json
import time
import requests
from os.path import join, isfile
from goose3 import Goose, Configuration
from nltk import word_tokenize
from time import sleep

path = '/Users/aadil/fake_news_detection/Snopes'
dump_path = '/Users/aadil/fake_news_detection/review_articles'
filepaths = [join(path, f) for f in os.listdir(path) if isfile(join(path, f))]
filepaths.sort()

start_time = time.time()
c = Configuration()
g = Goose(config=c)

if __name__ == '__main__':
    for i in range(2600, 3000):
        # Close the old goose instance and start a new one every 20 files.
        if i % 20 == 0:
            print('At step .... ', i)
            g.close()
            print('Restarting goose')
            g = Goose()
        with open(filepaths[i]) as f:
            urls = []
            data = json.load(f)

import requests
import csv
from io import StringIO
from smtplib import SMTP
import json
import hashlib
from datetime import date
from goose3 import Goose, Configuration

config = Configuration()
config.http_timeout = 10
g = Goose(config)


def hash_website(url):
    # verify is False since some of the government websites don't work
    # for some reason if verify is True; fix maybe in the future?
    return hashlib.sha512(
        g.extract(url).cleaned_text.encode('utf-8')).hexdigest()


def check_url_hash(url):
    # Compare the freshly computed hash against the stored one; a mismatch
    # means the page content changed since the last run.
    hash_content = hash_website(url)
    return (hash_content != website_hashes[url]), hash_content


with open('config.json', 'r') as f:
    info = json.load(f)
    email = info['email']
    password = info['password']
    recipients = info['recipients']
    spreadsheet_link = info['spreadsheet_link']

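# A hedged sketch of how check_url_hash might be driven; `website_hashes`
# is assumed to be a url -> previous-hash dict loaded elsewhere (e.g. from
# the spreadsheet), the URL below is a placeholder, and each call performs
# a live fetch through goose.
website_hashes = {'https://example.gov/page': '<previous sha512 hex digest>'}

for url in list(website_hashes):
    changed, new_hash = check_url_hash(url)
    if changed:
        print(f'{url} changed, new hash {new_hash[:16]}...')
        website_hashes[url] = new_hash  # remember the latest content hash
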
import requests
from lxml import etree
from goose3 import Goose, Configuration
from goose3.text import StopWordsChinese

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/68.0.3440.106 Safari/537.36'
}
# `url` comes from earlier context in the surrounding script.
res = requests.get(url=url, headers=header)
html = etree.HTML(res.text)
a_list = html.xpath('//*[@id="J_posts_list"]/tr/td[2]/p[1]/a[3]/@href')
# print(len(a_list))

uri = 'https://www.cifnews.com/article/30666'
# Chinese stop words are required for goose to score Chinese article text.
g = Goose({'stopwords_class': StopWordsChinese})
article = g.extract(url=uri)
config = Configuration()
config.enable_image_fetching = True  # note: never passed to the Goose above
text = article.cleaned_text
# print(text)

# res = requests.get('https://waimaoquan.alibaba.com/bbs/attachment/1809/thread/309_713107_6a75458c47ea357.png', headers=header)
# with open('sds.jpg', 'wb') as j:
#     j.write(res.content)

# The target page to request
targetUrl = "http://test.abuyun.com"