def get_news_result_cnt(self, news_url, keyword=''):
    # Route goose3 through the instance proxy, with the instance UA and
    # Chinese stop words for body-text scoring.
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = self.ua
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    # Fall back to a site-specific XPath when goose cannot clean the body.
    if article.cleaned_text:
        cont = article.cleaned_text
    else:
        text_tree = etree.HTML(article.raw_html)
        cont = ''.join(text_tree.xpath(
            '//div[@class="col-md-10 col-xs-12 detailNews"]/p//text()'
        )).replace('\xa0', '')
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword=keyword,
        url=article.final_url,
        title=article.title,
        platform='news',
        content=cont,
        author=article.authors,
        source=self.source,
        published_time=int(parse(article.publish_date).timestamp() * 1000)
        if article.publish_date else None,
        spi_time=int(time.time() * 1000),
    )
    return news_post

def get_news_result_cnt(self, news_url):
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/83.0.4103.116 Safari/537.36')
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    # dateutil handles most formats; fall back to the Chinese
    # "2020年01月02日" layout that it cannot parse.
    try:
        published_time = (int(parse(article.publish_date).timestamp() * 1000)
                          if article.publish_date else None)
    except (ValueError, OverflowError):
        published_time = int(time.mktime(time.strptime(
            article.publish_date, '%Y年%m月%d日')) * 1000)
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword='',
        url=article.final_url,
        title=article.title,
        platform='news',
        content=article.cleaned_text,
        author=article.authors,
        source=self.source,
        published_time=published_time,
        spi_time=int(time.time() * 1000),
    )
    return news_post

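# A minimal standalone sketch of the two-stage date parsing above, assuming
# `parse` is dateutil's parser as in the snippets here; `to_millis` is a
# hypothetical helper name, not part of the original code.
import time
from dateutil.parser import parse

def to_millis(raw_date):
    if not raw_date:
        return None
    try:
        # dateutil handles ISO dates, RFC 2822 timestamps, and similar.
        return int(parse(raw_date).timestamp() * 1000)
    except (ValueError, OverflowError):
        # Chinese-style dates such as "2020年07月01日" need an explicit format.
        return int(time.mktime(time.strptime(raw_date, '%Y年%m月%d日')) * 1000)

print(to_millis('2020-07-01'))      # epoch milliseconds
print(to_millis('2020年07月01日'))  # same day via the strptime fallback
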
def __init__(self, html=None, title=" ", **kwargs):
    self.html = html or None
    self.title = title or None
    self.entities = []
    self.keywords = []
    self.names = []
    self.fulltext = None
    self.language = None
    self.description = None
    self.canonical_url = None
    self.image = None
    self.published_date = None
    self.modified_date = None
    self.scraped_date = None
    self.contenthash = None
    self.reading_time = None
    # Skip image fetching: only the text fields above are populated here.
    config = Configuration()
    config.enable_image_fetching = False
    self.goose = Goose(config=config)
    self.tree = None

def get_news_result_cnt(self, news_url):
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = self.ua
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    # Only the cleaned body text is needed here.
    news_post = dict(content=article.cleaned_text)
    return news_post

def get_news_result_cnt(self, news_url):
    head = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,image/apng,*/*;q=0.8,'
                  'application/signed-exchange;v=b3;q=0.9',
    }
    proxy = {'http': self.proxy, 'https': self.proxy}
    req = requests.get(news_url, proxies=proxy, headers=head)
    # Pull the raw body out of the embedded JSON, then strip markup noise.
    cnt = ''.join(re.findall(r'content":"(.*?)"}', req.text, re.S) or '')
    for junk in ('<br>', '\xa0', '<br />', ' ',
                 '</strong>', '<strong>', '<u>', '</u>'):
        cnt = cnt.replace(junk, '')
    # Truncate at embedded widgets that mark the end of the article body.
    if '<iframe' in cnt:
        cnt = ''.join(re.findall(r'(.*?)<iframe', cnt, re.S))
    if '<div' in cnt:
        cnt = re.findall(r'(.*?)<div', cnt, re.S)[0]
    if 'allow=' in cnt:
        cnt = re.findall(r'(.*?)allow=', cnt, re.S)[0]
    config = Configuration()
    config.http_proxies = proxy
    config.browser_user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/83.0.4103.116 Safari/537.36')
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword='',
        url=article.final_url,
        title=article.title,
        platform='news',
        content=cnt,
        author=article.authors,
        source=self.source,
        published_time=int(parse(article.publish_date).timestamp() * 1000)
        if article.publish_date else None,
        spi_time=int(time.time() * 1000),
    )
    return news_post

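# A tiny standalone check of the truncation logic above on synthetic input;
# the sample string is made up for illustration.
import re

cnt = '正文一<br>正文二<iframe src="x"></iframe>尾部'
for junk in ('<br>', '\xa0', '<br />', ' '):
    cnt = cnt.replace(junk, '')
if '<iframe' in cnt:
    # Keep only the text segments before each embedded iframe.
    cnt = ''.join(re.findall(r'(.*?)<iframe', cnt, re.S))
print(cnt)  # -> 正文一正文二
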
def auto_news_main_content(self, news_url, keyword=''):
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = self.ua
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword=keyword,
        url=article.final_url,
        title=article.title,
        platform='news',
        content=article.cleaned_text,
        author=article.authors,
        # Fall back to the article's domain when no source is configured.
        source=self.source if self.source else article.domain,
        published_time=int(parse(article.publish_date).timestamp() * 1000)
        if article.publish_date else None,
        spi_time=int(time.time() * 1000),
    )
    return news_post

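# Standalone sketch of the goose3 setup the extractor methods above share;
# the proxy address and target URL are placeholders, not values from this
# code.
from goose3 import Goose, Configuration
from goose3.text import StopWordsChinese

config = Configuration()
config.http_proxies = {'http': 'http://127.0.0.1:8888',
                       'https': 'http://127.0.0.1:8888'}
config.browser_user_agent = 'Mozilla/5.0'
config.stopwords_class = StopWordsChinese
article = Goose(config).extract('https://example.com/news/1')
print(article.title, len(article.cleaned_text))
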
import os
import json
import time
import requests
from os.path import join, isfile
from goose3 import Goose, Configuration
from nltk import word_tokenize
from time import sleep

path = '/Users/aadil/fake_news_detection/Snopes'
dump_path = '/Users/aadil/fake_news_detection/review_articles'
filepaths = [join(path, f) for f in os.listdir(path) if isfile(join(path, f))]
filepaths.sort()

start_time = time.time()
c = Configuration()
g = Goose(config=c)

if __name__ == '__main__':
    for i in range(2600, 3000):
        # Close the old goose instance and start a new one every 20 files.
        if i % 20 == 0:
            print('At step .... ', i)
            g.close()
            print('Restarting goose')
            g = Goose()
        with open(filepaths[i]) as f:
            urls = []
            data = json.load(f)

import requests
import csv
from io import StringIO
from smtplib import SMTP
import json
import hashlib
from datetime import date
from goose3 import Goose, Configuration

config = Configuration()
config.http_timeout = 10
g = Goose(config)


def hash_website(url):
    # verify is False since some of the government websites don't work
    # for some reason if verify is True; fix maybe in the future?
    return hashlib.sha512(
        g.extract(url).cleaned_text.encode('utf-8')).hexdigest()


def check_url_hash(url):
    # Compare the freshly computed hash against the stored one; a mismatch
    # means the page content changed since the last run.
    hash_content = hash_website(url)
    return (hash_content != website_hashes[url]), hash_content


with open('config.json', 'r') as f:
    info = json.load(f)
    email = info['email']
    password = info['password']
    recipients = info['recipients']
    spreadsheet_link = info['spreadsheet_link']

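# A hedged sketch of how check_url_hash might be driven; `website_hashes`
# is assumed to be a url -> previous-hash dict loaded elsewhere (e.g. from
# the spreadsheet), the URL below is a placeholder, and each call performs
# a live fetch through goose.
website_hashes = {'https://example.gov/page': '<previous sha512 hex digest>'}

for url in list(website_hashes):
    changed, new_hash = check_url_hash(url)
    if changed:
        print(f'{url} changed, new hash {new_hash[:16]}...')
        website_hashes[url] = new_hash  # remember the latest content hash
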
import requests
from lxml import etree
from goose3 import Goose, Configuration
from goose3.text import StopWordsChinese

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/68.0.3440.106 Safari/537.36'
}
# `url` comes from earlier context in the surrounding script.
res = requests.get(url=url, headers=header)
html = etree.HTML(res.text)
a_list = html.xpath('//*[@id="J_posts_list"]/tr/td[2]/p[1]/a[3]/@href')
# print(len(a_list))

uri = 'https://www.cifnews.com/article/30666'
# Chinese stop words are required for goose to score Chinese article text.
g = Goose({'stopwords_class': StopWordsChinese})
article = g.extract(url=uri)
config = Configuration()
config.enable_image_fetching = True  # note: never passed to the Goose above
text = article.cleaned_text
# print(text)

# res = requests.get('https://waimaoquan.alibaba.com/bbs/attachment/1809/thread/309_713107_6a75458c47ea357.png', headers=header)
# with open('sds.jpg', 'wb') as j:
#     j.write(res.content)

# The target page to request
targetUrl = "http://test.abuyun.com"