示例#1
0
 def get_news_result_cnt(self, news_url, keyword=''):
     """Extract one news article with Goose and return it as a post record.

     Args:
         news_url: URL of the article page to fetch.
         keyword: Value stored unchanged under the ``keyword`` key.

     Returns:
         dict with the keys ``doc_id`` (MD5 hex digest of the final URL),
         ``keyword``, ``url``, ``title``, ``platform`` (always ``'news'``),
         ``content``, ``author``, ``source``, ``published_time`` (epoch
         milliseconds, or ``None`` when Goose reports no publish date) and
         ``spi_time`` (current time in epoch milliseconds).
     """
     config = Configuration()
     config.browser_user_agent = self.ua
     config.stopwords_class = StopWordsChinese
     # The same proxy serves both schemes; one assignment is enough.
     config.http_proxies = {'http': self.proxy, 'https': self.proxy}

     article = Goose(config).extract(news_url)

     cont = article.cleaned_text
     if not cont:
         # Goose produced no body text: fall back to the page-specific
         # container. The HTML is parsed only on this path, so the common
         # case neither pays for a second parse nor fails on it.
         raw_html = article.raw_html
         text_tree = etree.HTML(raw_html) if raw_html else None
         if text_tree is None:
             # Nothing to query (empty page or no tree built).
             cont = ''
         else:
             paragraphs = text_tree.xpath(
                 '//div[@class="col-md-10 col-xs-12 detailNews"]/p//text()')
             cont = ''.join(paragraphs).replace('\xa0', '')

     if article.publish_date:
         published_time = int(parse(article.publish_date).timestamp() * 1000)
     else:
         published_time = None

     final_url = article.final_url
     return dict(
         doc_id=md5(final_url.encode('utf-8')).hexdigest(),
         keyword=keyword,
         url=final_url,
         title=article.title,
         platform='news',
         content=cont,
         author=article.authors,
         source=self.source,
         published_time=published_time,
         spi_time=int(time.time() * 1000),
     )
示例#2
0
 def get_news_result_cnt(self, news_url):
     """Extract one news article with Goose and return it as a post record.

     The publish date is first handed to ``parse``; if that fails, it is
     retried with the fixed ``"%Y年%m月%d日"`` format.

     Args:
         news_url: URL of the article page to fetch.

     Returns:
         dict with the keys ``doc_id`` (MD5 hex digest of the final URL),
         ``keyword`` (always ``''``), ``url``, ``title``, ``platform``
         (always ``'news'``), ``content``, ``author``, ``source``,
         ``published_time`` (epoch milliseconds, or ``None`` when Goose
         reports no publish date) and ``spi_time`` (current time in epoch
         milliseconds).

     Raises:
         ValueError: If the publish date matches neither ``parse`` nor the
             ``"%Y年%m月%d日"`` fallback format.
     """
     config = Configuration()
     config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
     config.stopwords_class = StopWordsChinese
     # The same proxy serves both schemes; one assignment is enough.
     config.http_proxies = {'http': self.proxy, 'https': self.proxy}

     article = Goose(config).extract(news_url)

     raw_date = article.publish_date
     if not raw_date:
         published_time = None
     else:
         try:
             published_time = int(parse(raw_date).timestamp() * 1000)
         except (ValueError, OverflowError, TypeError):
             # Only parsing failures are caught, so KeyboardInterrupt and
             # SystemExit are no longer swallowed by a bare ``except``.
             # NOTE(review): assumes ``parse`` is dateutil's parser - confirm.
             fallback = time.strptime(raw_date, "%Y年%m月%d日")
             published_time = int(time.mktime(fallback) * 1000)

     final_url = article.final_url
     return dict(
         doc_id=md5(final_url.encode('utf-8')).hexdigest(),
         keyword='',
         url=final_url,
         title=article.title,
         platform='news',
         content=article.cleaned_text,
         author=article.authors,
         source=self.source,
         published_time=published_time,
         spi_time=int(time.time() * 1000),
     )
示例#3
0
 def get_news_result_cnt(self, news_url):
     """Extract the cleaned body text of one news article with Goose.

     Args:
         news_url: URL of the article page to fetch.

     Returns:
         dict with a single ``content`` key holding Goose's cleaned text.
     """
     config = Configuration()
     config.browser_user_agent = self.ua
     config.stopwords_class = StopWordsChinese
     # The same proxy serves both schemes; one assignment is enough.
     config.http_proxies = {'http': self.proxy, 'https': self.proxy}

     article = Goose(config).extract(news_url)
     return {'content': article.cleaned_text}
示例#4
0
    def get_news_result_cnt(self, news_url):
        """Build a post record whose body is scraped from the raw response.

        The page is fetched twice: once with ``requests`` to pull the body
        out of the ``content":"..."}`` fragments of the response text, and
        once with Goose for the title, authors, final URL and publish date.

        Args:
            news_url: URL of the article page to fetch.

        Returns:
            dict with the keys ``doc_id`` (MD5 hex digest of the final URL),
            ``keyword`` (always ``''``), ``url``, ``title``, ``platform``
            (always ``'news'``), ``content``, ``author``, ``source``,
            ``published_time`` (epoch milliseconds, or ``None`` when Goose
            reports no publish date) and ``spi_time`` (current time in epoch
            milliseconds).
        """
        proxy = {'http': self.proxy, 'https': self.proxy}
        head = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        }
        # NOTE(review): no timeout is passed, so this call can block for as
        # long as the server keeps the connection open - consider adding one.
        req = requests.get(news_url, proxies=proxy, headers=head)

        # Concatenate every captured fragment; an empty match list gives ''.
        cnt = ''.join(re.findall(r'content":"(.*?)"}', req.text, re.S))

        # Strip inline markup and non-breaking spaces, in the original order.
        for token in ('<br>', '\xa0', '<br />', '&nbsp;', '</strong>',
                      '<strong>', '<u>', '</u>'):
            cnt = cnt.replace(token, '')

        if '<iframe' in cnt:
            # Keep every segment that precedes an '<iframe' marker; the text
            # after the last marker is dropped.
            cnt = ''.join(re.findall(r'(.*?)<iframe', cnt, re.S))
        # Cut at the first '<div' and then at the first 'allow='; partition
        # leaves the text untouched when the marker is absent.
        cnt = cnt.partition('<div')[0]
        cnt = cnt.partition('allow=')[0]

        config = Configuration()
        config.browser_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
        config.stopwords_class = StopWordsChinese
        # The same proxy serves both schemes; one assignment is enough.
        config.http_proxies = proxy

        article = Goose(config).extract(news_url)

        if article.publish_date:
            published_time = int(parse(article.publish_date).timestamp() * 1000)
        else:
            published_time = None

        final_url = article.final_url
        return dict(
            doc_id=md5(final_url.encode('utf-8')).hexdigest(),
            keyword='',
            url=final_url,
            title=article.title,
            platform='news',
            content=cnt,
            author=article.authors,
            source=self.source,
            published_time=published_time,
            spi_time=int(time.time() * 1000),
        )
示例#5
0
 def auto_news_main_content(self, news_url, keyword=''):
     """Extract one news article with Goose and return it as a post record.

     Args:
         news_url: URL of the article page to fetch.
         keyword: Value stored unchanged under the ``keyword`` key.

     Returns:
         dict with the keys ``doc_id`` (MD5 hex digest of the final URL),
         ``keyword``, ``url``, ``title``, ``platform`` (always ``'news'``),
         ``content``, ``author``, ``source`` (``self.source`` when truthy,
         otherwise the article's domain), ``published_time`` (epoch
         milliseconds, or ``None`` when Goose reports no publish date) and
         ``spi_time`` (current time in epoch milliseconds).
     """
     config = Configuration()
     config.browser_user_agent = self.ua
     config.stopwords_class = StopWordsChinese
     # The same proxy serves both schemes; one assignment is enough.
     config.http_proxies = {'http': self.proxy, 'https': self.proxy}

     article = Goose(config).extract(news_url)

     if article.publish_date:
         published_time = int(parse(article.publish_date).timestamp() * 1000)
     else:
         published_time = None

     final_url = article.final_url
     return dict(
         doc_id=md5(final_url.encode('utf-8')).hexdigest(),
         keyword=keyword,
         url=final_url,
         title=article.title,
         platform='news',
         content=article.cleaned_text,
         author=article.authors,
         source=self.source or article.domain,
         published_time=published_time,
         spi_time=int(time.time() * 1000),
     )