def get_news_result_cnt(self, news_url, keyword=''):
    """Extract a news article with Goose and build the result document."""
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = self.ua
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    # Prefer Goose's cleaned body text; fall back to a site-specific XPath
    # on the raw HTML when extraction yields nothing.
    if article.cleaned_text:
        cont = article.cleaned_text
    else:
        text_tree = etree.HTML(article.raw_html)
        cont = ''.join(text_tree.xpath(
            '//div[@class="col-md-10 col-xs-12 detailNews"]/p//text()'
        )).replace('\xa0', '')
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword=keyword,
        url=article.final_url,
        title=article.title,
        platform='news',
        content=cont,
        author=article.authors,
        source=self.source,
        published_time=int(parse(article.publish_date).timestamp() * 1000)
        if article.publish_date else None,
        spi_time=int(time.time() * 1000),
    )
    return news_post
def get_news_result_cnt(self, news_url):
    """Extract a news article with Goose, handling Chinese-style publish dates."""
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36')
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    # dateutil cannot parse dates such as "2020年07月01日", so fall back to strptime
    # with the Chinese date format.
    try:
        published_time = (int(parse(article.publish_date).timestamp() * 1000)
                          if article.publish_date else None)
    except (ValueError, OverflowError):
        published_time = int(time.mktime(
            time.strptime(article.publish_date, '%Y年%m月%d日')) * 1000)
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword='',
        url=article.final_url,
        title=article.title,
        platform='news',
        content=article.cleaned_text,
        author=article.authors,
        source=self.source,
        published_time=published_time,
        spi_time=int(time.time() * 1000),
    )
    return news_post
def get_news_result_cnt(self, news_url):
    """Extract only the cleaned body text of a news article."""
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = self.ua
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    news_post = dict(content=article.cleaned_text)
    return news_post
def get_news_result_cnt(self, news_url):
    """Pull the article body out of an embedded JSON payload, then use Goose for metadata."""
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
                  'image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    }
    proxies = {'http': self.proxy, 'https': self.proxy}
    req = requests.get(news_url, proxies=proxies, headers=headers)
    # The body text sits in a `content":"..."}` fragment; strip inline markup,
    # non-breaking spaces and whitespace (order matters: remove '<br />' before ' ').
    cnt = ''.join(re.findall(r'content":"(.*?)"}', req.text, re.S) or '')
    for junk in ('<br>', '<br />', '\xa0', ' ', '<strong>', '</strong>', '<u>', '</u>'):
        cnt = cnt.replace(junk, '')
    # Drop anything after an embedded iframe, div or allow= attribute.
    if '<iframe' in cnt:
        cnt = ''.join(re.findall(r'(.*?)<iframe', cnt, re.S) or '')
    if '<div' in cnt:
        cnt = re.findall(r'(.*?)<div', cnt, re.S)[0]
    if 'allow=' in cnt:
        cnt = re.findall(r'(.*?)allow=', cnt, re.S)[0]
    config = Configuration()
    config.http_proxies = proxies
    config.browser_user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36')
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword='',
        url=article.final_url,
        title=article.title,
        platform='news',
        content=cnt,
        author=article.authors,
        source=self.source,
        published_time=int(parse(article.publish_date).timestamp() * 1000)
        if article.publish_date else None,
        spi_time=int(time.time() * 1000),
    )
    return news_post
def auto_news_main_content(self, news_url, keyword=''):
    """Generic Goose-based extraction; falls back to the article domain as source."""
    config = Configuration()
    config.http_proxies = {'http': self.proxy, 'https': self.proxy}
    config.browser_user_agent = self.ua
    config.stopwords_class = StopWordsChinese
    g = Goose(config)
    article = g.extract(news_url)
    news_post = dict(
        doc_id=md5(article.final_url.encode('utf-8')).hexdigest(),
        keyword=keyword,
        url=article.final_url,
        title=article.title,
        platform='news',
        content=article.cleaned_text,
        author=article.authors,
        source=self.source if self.source else article.domain,
        published_time=int(parse(article.publish_date).timestamp() * 1000)
        if article.publish_date else None,
        spi_time=int(time.time() * 1000),
    )
    return news_post
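# Sketch of the module-level imports these extraction methods rely on; the actual
# header of this module may differ, and self.proxy, self.ua and self.source are
# assumed to be set by the enclosing class (not shown in this section).
import re
import time
from hashlib import md5

import requests
from dateutil.parser import parse
from goose3 import Goose, Configuration
from goose3.text import StopWordsChinese
from lxml import etree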