Пример #1
0
 def get_content(self, response):
     """Build an item from an article page and yield it.

     Paragraphs are collected from three alternative page layouts, the
     "original title" line is dropped, whitespace characters and the
     embedded video-player script text are removed.

     :param response: page response; ``response.meta`` must provide
         ``title`` and ``second_tag``.
     :return: generator yielding one ``JoviLonglasttimeItem``.
     """
     meta = response.meta
     # The third alternative used to read '//div[@id="content_area]/p"]',
     # which compares @id with the literal 'content_area]/p' and therefore
     # never matched anything.
     paragraphs = response.xpath(
         '//div[@class="cnt_bd"]/p[not(@style)] | '
         '//div[@class="shizhendema_Aind_9810_2013120304"]/div[@class="bd"]/p[not(@style)] |'
         '//div[@id="content_area"]/p').xpath('string()').extract()
     pattern = r'原标题:|原标题:'
     content = ''.join(
         text.strip() for text in paragraphs
         if not re.search(pattern, text))
     # string() also returns the inline player bootstrap script; it is
     # removed verbatim after the whitespace clean-up.
     rep_content = 'var fo = createPlayer("v_player",540,400);fo.addVariable("videoId","vid");fo.addVariable("videoCenterId","bb13275ded2b46638e9ffc02983aaf38");fo.addVariable("videoType","0");fo.addVariable("videoEditMode","1");fo.addVariable("isAutoPlay","true");fo.addVariable("tai","news");fo.addVariable("languageConfig","");fo.addParam("wmode","opaque");writePlayer(fo,"embed_playerid");'
     for junk in ('\n', '\t', '\r', '\xa0', '\u3000', rep_content):
         content = content.replace(junk, '')
     item = JoviLonglasttimeItem()
     item['article_title'] = meta['title']
     item['article_content'] = content
     item['first_tag'] = '央视新闻'
     item['second_tag'] = meta['second_tag']
     item['article_url'] = response.url
     yield item
Пример #2
0
    def get_content(self, response):
        """Build an item from an article page and yield it.

        Text nodes of the paragraphs under ``#whole_content`` are joined,
        skipping any node that matches the boilerplate pattern (credits,
        captions, source lines, ...).

        :param response: page response; ``response.meta`` must provide
            ``title`` and ``second_tag``.
        :return: generator yielding one ``JoviLonglasttimeItem``.
        """
        meta = response.meta
        item = JoviLonglasttimeItem()

        fragments = response.xpath(
            '//div[@id="whole_content"]/p/text()').extract()
        # The pattern previously contained '|||', i.e. empty alternatives.
        # An empty alternative matches every string, so every fragment was
        # skipped and the content was always empty.
        skip_pattern = r'/|图\d:|图文|说明:|原标题|原题|选自:|公众号|▲|文章来自|本文|来源|来自网络|作者:|声明:|译自|如有侵权|\||编辑|编者|往期回顾|记者|点击进入|联合出品|图为|提示:|导语:|资料图|图注:'
        content = ''.join(
            text.strip() for text in fragments
            if not re.search(skip_pattern, text))
        for junk in ('\r', '\n', '\t', '\u3000', '\xa0', '\u200b'):
            content = content.replace(junk, '')
        item['article_content'] = content
        item['first_tag'] = '手机凤凰网'
        item['second_tag'] = meta['second_tag']
        # First span of the title block is used as the time, the last one
        # as the source.
        item['update_time'] = response.xpath(
            '//div[contains(@class,"acTxtTit ")]//span[1]/text()'
        ).extract_first()
        item['source'] = response.xpath(
            '//div[contains(@class,"acTxtTit ")]//span[last()]/text()'
        ).extract_first()
        item['article_title'] = meta['title']
        item['article_url'] = response.url
        yield item
Пример #3
0
Файл: UC.py Проект: lngbll/JOVI
 def get_content(self, response):
     """Build an item from a page that embeds the article as JSON text.

     The HTML body is taken from the ``"content"`` field of the response
     text, the publish time from ``"publish_time"`` (divided by 1000
     before conversion, i.e. treated as milliseconds).

     :param response: page response; ``response.meta`` must provide
         ``title``, ``third_tag``, ``source`` and ``label``.
     :return: generator yielding one ``JoviLonglasttimeItem``; on any
         failure nothing is yielded and the URL is printed.
     """
     try:
         meta = response.meta
         item = JoviLonglasttimeItem()
         content = re.search(r'"content":"(.*?)","thumbnails"',
                             response.text, re.S).group(1)
         fragments = etree.HTML(content).xpath('//p//text()')
         ptime = int(
             re.search(r'"publish_time":(\d+),', response.text,
                       re.S).group(1))
         # Keep only the calendar date (YYYY-MM-DD).
         pub_time = datetime.fromtimestamp(ptime / 1000).date().isoformat()
         # The pattern previously contained '|||'; the empty alternatives
         # matched every fragment, so the article body was always empty.
         skip_pattern = r'图文|说明:|原标题|原题|选自:|公众号|▲|文章来自|本文|来源|来自网络|作者:|声明:|译自|如有侵权|\||编辑|编者|往期回顾|记者|点击进入|联合出品|提示:|导语:'
         article_content = ''.join(
             text.strip() for text in fragments
             if not re.search(skip_pattern, text))
         for junk in ('\r', '\n', '\t', '\xa0', '\u3000'):
             article_content = article_content.replace(junk, '')
         item['article_content'] = article_content
         item['first_tag'] = 'UC头条'
         item['second_tag'] = meta['third_tag']
         item['article_url'] = response.url
         item['source'] = meta['source']
         item['label'] = meta['label']
         item['update_time'] = pub_time
         item['article_title'] = meta['title']
         yield item
     except Exception:
         # Best-effort callback: a page without the expected JSON fields
         # (re.search returning None, missing meta keys, ...) is reported
         # and skipped rather than failing the crawl.
         print('请求异常----%s' % response.url)
Пример #4
0
 def get_content(self, response):
     """Accumulate a multi-page article and yield the item on the last page.

     The filtered text of the current page is appended to
     ``meta['content']``. If a link follows the current-page marker, a
     request for it is yielded with the same meta; otherwise the item is
     built from the accumulated content.

     :param response: page response; ``response.meta`` must provide
         ``content`` (text accumulated so far), ``title``, ``first_tag``,
         ``second_tag``, ``third_tag`` and ``source``.
     :return: generator yielding either a follow-up ``scrapy.Request`` or
         one ``JoviLonglasttimeItem``.
     """
     meta = response.meta
     current_page = response.xpath('//a[@class="cur"]')
     next_page = current_page.xpath('following-sibling::a')
     fragments = response.xpath(
         '//div[@id="J-contain_detail_cnt"]//text()').extract()
     # The pattern previously contained '|||'; the empty alternatives
     # matched every fragment, so nothing was ever accumulated.
     skip_pattern = r'原标题:|选自:|公众号|▲|文章来自|本文|来源|来自网络|作者:|声明:|译自|如有侵权|\||责任编辑:|编者按|往期回顾|记者|点击进入|联合出品|【精彩推荐】|·'
     meta['content'] += ''.join(
         text.strip() for text in fragments
         if not re.search(skip_pattern, text))
     if next_page:
         url = 'https://mini.eastday.com/a/' + next_page.xpath(
             '@href').extract_first()
         yield scrapy.Request(url=url, callback=self.get_content, meta=meta)
         return
     item = JoviLonglasttimeItem()
     item['first_tag'] = meta['first_tag']
     item['second_tag'] = meta['second_tag']
     item['third_tag'] = meta['third_tag']
     item['source'] = meta['source']
     item['update_time'] = response.xpath(
         '//div[@class="fl"]/i[1]/text()').re_first(r'\d+-\d+-\d+')
     # Remove '-<digits>' groups from the URL (presumably the page-number
     # suffix, so all pages map to one article URL -- TODO confirm).
     item['article_url'] = re.sub(r'-\d+', '', response.url)
     item['article_title'] = meta['title']
     content = meta['content']
     for junk in ('\r', '\n', '\t', '\xa0', '\u3000', '\ufeff'):
         content = content.replace(junk, '')
     item['article_content'] = content
     yield item
Пример #5
0
 def get_content(self, response):
     """Build an item using the per-host XPath rules in ``self.xpath``.

     The rule set is looked up by the host of ``response.url``; when no
     rule exists the URL is logged and printed and nothing is yielded.
     Paragraphs matching the first pattern are skipped; the first
     paragraph matching the second pattern ends the article.

     :param response: page response; ``response.meta`` must provide
         ``first_tag`` and ``second_tag``.
     :return: generator yielding at most one ``JoviLonglasttimeItem``.
     """
     meta = response.meta
     item = JoviLonglasttimeItem()
     item['article_url'] = response.url
     item['first_tag'] = meta['first_tag']
     item['second_tag'] = meta['second_tag']
     rules = self.xpath.get(urlparse(response.url).netloc)
     if not rules:
         logger.info('This URL parsing xpath is not settled:{}'.format(
             response.url))
         print(response.url)
         return
     item['article_title'] = response.xpath(rules['title']).get().strip()
     paragraphs = response.xpath(rules['ps']).getall()
     # NOTE(review): '{}' below is matched literally; it looks like a
     # placeholder that was never formatted -- confirm intent.
     skip_pattern = r'责任编辑:|作者:|出处:|{}|来自:|来源 :|来源:|来源 : |图片来自|图片由|图:|更多精彩|请投稿至:|文|文/|编辑'
     stop_pattern = r'关注微信公众号|参考资料|声明:|原网页已经由 ZAKER 转码排版 |推荐阅读'
     kept = []
     for paragraph in paragraphs:
         if re.search(skip_pattern, paragraph):
             continue
         if re.search(stop_pattern, paragraph):
             break
         kept.append(paragraph.strip())
     body = ''.join(kept)
     for junk in ('\n', '\r', '\t', '\u3000', '\xa0'):
         body = body.replace(junk, '')
     item['article_content'] = body
     yield item
Пример #6
0
 def get_content(self, response):
     """Build an item from an article page and yield it.

     Responses with an empty body are ignored. Text nodes are collected
     from three alternative article containers and boilerplate lines are
     skipped.

     :param response: page response; ``response.meta`` must provide
         ``first_tag``, ``second_tag``, ``third_tag``, ``source``,
         ``update_time`` and ``title``.
     :return: generator yielding at most one ``JoviLonglasttimeItem``.
     """
     if not response.body:
         return
     meta = response.meta
     fragments = response.xpath(
         '//article/p//text()|//section[@class="art_pic_card art_content"]/p//text()|//div[@class="article"]/p//text()'
     ).extract()
     # Raw string: the pattern ends with the regex escape '\|', which is
     # an invalid escape sequence in a normal string literal.
     skip_pattern = r'原标题:|图片来自|图片来源|文章转自|文章来自|本文来源|本文来自|作者:|微信公众号|更多信息请关注|来源:|如有侵权|点击进入专题|作者署名|本文是|ID:|✎|文\|'
     content = ''.join(
         text.strip() for text in fragments
         if not re.search(skip_pattern, text))
     for junk in ('\r', '\n', '\t', '\xa0', '\u3000'):
         content = content.replace(junk, '')
     item = JoviLonglasttimeItem()
     item['first_tag'] = meta['first_tag']
     item['second_tag'] = meta['second_tag']
     item['third_tag'] = meta['third_tag']
     item['article_url'] = response.url
     item['source'] = meta['source']
     item['update_time'] = meta['update_time']
     item['article_title'] = meta['title']
     item['article_content'] = content
     yield item
Пример #7
0
 def get_content(self, response):
     """Build an item from an article page and yield it.

     Text nodes under ``div.post_content`` are joined. Collection stops at
     the first "related reading" marker; boilerplate lines before it are
     skipped. The title is read from ``div.post_title`` and falls back to
     an empty string when absent.

     :param response: page response; ``response.meta`` must provide
         ``second_tag`` and ``third_tag``.
     :return: generator yielding one ``JoviLonglasttimeItem``.
     """
     meta = response.meta
     item = JoviLonglasttimeItem()
     fragments = response.xpath(
         '//div[@class="post_content"]//p//text()').extract()
     stop_pattern = r'相关阅读:'
     # Two fixes compared with the previous pattern:
     #  * '|||' (empty alternatives) matched every fragment, so the
     #    content was always empty;
     #  * the stop marker was also listed here, which made the break
     #    branch unreachable. It is now checked first, on its own.
     skip_pattern = r'图文|说明:|原标题|原题|选自:|公众号|▲|文章来自|本文|来源|来自网络|作者:|声明:|译自|如有侵权|\||编辑|编者|往期回顾|记者|点击进入|提示:|导语:|转载联系|责编'
     kept = []
     for text in fragments:
         if re.search(stop_pattern, text):
             break
         if re.search(skip_pattern, text):
             continue
         kept.append(text.strip())
     content = ''.join(kept)
     for junk in ('\r', '\t', '\n', '\xa0', '\u3000'):
         content = content.replace(junk, '')
     item['article_content'] = content
     # extract_first() returns None when the node is missing; use '' then
     # instead of hiding every possible error behind a bare except.
     title = response.xpath(
         '//div[@class="post_title"]//text()').extract_first()
     item['article_title'] = (title or '').strip()
     item['first_tag'] = 'IT之家'
     item['second_tag'] = meta['second_tag']
     item['third_tag'] = meta['third_tag']
     item['article_url'] = response.url
     yield item
Пример #8
0
    def get_content(self, response):
        """Build an item from an article page and yield it.

        Text nodes of the paragraphs inside the ``page js-page`` containers
        are joined, skipping boilerplate lines.

        :param response: page response; ``response.meta`` must provide
            ``first_tag``, ``second_tag``, ``third_tag``, ``source``,
            ``update_time`` and ``title``.
        :return: generator yielding one ``JoviLonglasttimeItem``.
        """
        meta = response.meta
        fragments = response.xpath(
            '//div[contains(@class,"page js-page")]/p//text()').extract()
        # The pattern previously contained '|||'; the empty alternatives
        # matched every fragment, so the content was always empty.
        skip_pattern = r'原标题:|选自:|公众号|▲|文章来自|本文|来源|来自网络|作者:|声明:|译自|如有侵权|\||编辑:|编者按|往期回顾|记者|点击进入|联合出品|【精彩推荐】|·'
        content = ''.join(
            text.strip() for text in fragments
            if not re.search(skip_pattern, text))
        for junk in ('\r', '\n', '\t', '\xa0', '\u3000'):
            content = content.replace(junk, '')

        item = JoviLonglasttimeItem()
        item['first_tag'] = meta['first_tag']
        item['second_tag'] = meta['second_tag']
        item['third_tag'] = meta['third_tag']
        item['label'] = ''
        item['source'] = meta['source']
        item['update_time'] = meta['update_time']
        item['article_url'] = response.url
        item['article_title'] = meta['title']
        item['article_content'] = content
        yield item
Пример #9
0
 def get_content(self, response):
     """Build an item from an article page and yield it.

     The string value of every paragraph under ``#artibody`` is joined,
     skipping paragraphs that match the boilerplate pattern.

     :param response: page response; ``response.meta`` must provide
         ``title``, ``second_tag``, ``third_tag`` and ``update_time``.
     :return: generator yielding one ``JoviLonglasttimeItem``.
     """
     meta = response.meta
     item = JoviLonglasttimeItem()
     paragraphs = response.xpath('//div[@id="artibody"]/p').xpath(
         'string()').extract()
     # The pattern previously contained '|||'; the empty alternatives
     # matched every paragraph, so the content was always empty. The
     # duplicated '原文' alternative is listed once now.
     skip_pattern = r'点击[上下]方|关注(.*?)公众号|关注(.*?)微信|↓|原文|相关阅读|说明:|原标题|原题|选自:|公众号|▲|文章来自|本文|来自网络|作者:|声明:|译自|如有侵权|\||编辑|编者|往期回顾|记者|点击进入|提示:|导语:|转载联系|责编|译者:|来源:'
     content = ''.join(
         text.strip() for text in paragraphs
         if not re.search(skip_pattern, text))
     for junk in ('\r', '\t', '\n', '\xa0', '\u3000', '\u200b'):
         content = content.replace(junk, '')
     item['article_content'] = content
     item['article_title'] = meta['title']
     item['article_url'] = response.url
     item['first_tag'] = '新浪游记'
     item['second_tag'] = meta['second_tag']
     item['third_tag'] = meta['third_tag']
     item['update_time'] = meta['update_time']
     item['label'] = ''
     item['source'] = ''
     yield item
Пример #10
0
 def parse_article(self, response):
     """Build an item from the paragraphs under ``#content`` and yield it.

     Each text node is stripped and has line breaks removed before the
     pieces are concatenated. URL and title come from ``response.meta``.

     :param response: page response; ``response.meta`` must provide
         ``article_url``, ``second_tag`` and ``article_title``.
     :return: generator yielding one ``JoviLonglasttimeItem``.
     """
     meta = response.meta
     item = JoviLonglasttimeItem()
     fragments = response.xpath('//div[@id="content"]/p/text()').getall()
     cleaned = [
         fragment.strip().replace('\n', '').replace('\r', '')
         for fragment in fragments
     ]
     item['article_url'] = meta['article_url']
     item['first_tag'] = 'Zaker新闻'
     item['second_tag'] = meta['second_tag']
     item['article_title'] = meta['article_title']
     item['article_content'] = ''.join(cleaned)
     yield item
Пример #11
0
 def get_content(self, response):
     """Build an item from a JSON API response and yield it.

     The response text is parsed as JSON; title and HTML body are read
     from its ``body`` object and the paragraph text nodes of the HTML are
     concatenated.

     :param response: API response; ``response.meta`` must provide
         ``second_tag``.
     :return: generator yielding one ``JoviLonglasttimeItem``.
     """
     meta = response.meta
     item = JoviLonglasttimeItem()
     data = json.loads(response.text)
     body = data.get('body')
     item['first_tag'] = '凤凰新闻'
     item['second_tag'] = meta['second_tag']
     item['article_url'] = response.url
     item['article_title'] = body.get('title')
     selector = scrapy.Selector(text=body.get('text'))
     # SelectorList exposes getall(); the former get_all() call raised
     # AttributeError on every response.
     item['article_content'] = ''.join(
         selector.xpath('//p//text()').getall())
     print(item)
     yield item
Пример #12
0
 def get_content(self, response):
     """Build an item from a JSON API response and yield it.

     The first entry of ``documents`` supplies the title and the HTML
     content; text nodes of class-less paragraphs are stripped and
     joined. When the content is missing or unusable, nothing is yielded.

     :param response: API response; ``response.meta`` must provide
         ``second_tag``.
     :return: generator yielding at most one ``JoviLonglasttimeItem``.
     """
     meta = response.meta
     payload = json.loads(response.text)
     document = payload['documents'][0]
     item = JoviLonglasttimeItem()
     item['first_tag'] = '一点资讯'
     item['second_tag'] = meta['second_tag']
     item['article_url'] = response.url
     item['article_title'] = document['title']
     try:
         selector = Selector(text=document['content'])
     except (LookupError, TypeError, ValueError):
         # No 'content' key, or a value Selector cannot parse: skip the
         # document. Narrowed from a bare except so that unrelated errors
         # (e.g. KeyboardInterrupt) are no longer swallowed.
         return
     fragments = selector.xpath('//p[not(@class)]//text()').getall()
     item['article_content'] = ''.join(text.strip() for text in fragments)
     yield item
Пример #13
0
 def get_content(self, response):
     """Build an item from either of two page formats and yield it.

     URLs matching the legacy pattern carry the article in
     ``#main_content``; other pages embed it in a ``var allData = ...``
     JSON assignment, of which the last ``contentList`` entry is used when
     its type is ``text``. Boilerplate lines are skipped and collection
     stops at an ``END`` marker.

     :param response: page response; ``response.meta`` must provide
         ``second_tag``, ``third_tag`` and ``title``.
     :return: generator yielding one ``JoviLonglasttimeItem`` (with empty
         content when nothing could be extracted).
     """
     meta = response.meta
     item = JoviLonglasttimeItem()
     pattern = r'http[s]*://[\S]+.ifeng.com/a/[\d]{8}/[\d]+_0.[s]*html'
     # Two URL forms exist and each needs its own extraction rule.
     if re.search(pattern, response.url):
         contents = response.xpath(
             '//div[@id="main_content"]/p//text()').extract()
     else:
         try:
             raw_data = re.search('var allData = (.*?);\n',
                                  response.text).group(1)
             all_data = json.loads(raw_data)
             last_block = all_data['docData']['contentData'][
                 'contentList'][-1]
             if last_block['type'] == 'text':
                 contents = scrapy.Selector(text=last_block['data']).xpath(
                     '//p[not(@class)]//text()').extract()
             else:
                 contents = []
                 print('内容是视频或者图片----%s' % response.url)
         except Exception as e:
             # Best effort: a redirect or a page without the embedded
             # data is reported and produces an empty article.
             print('可能发生跳转或者没有内容----%s' % response.url)
             print(e)
             contents = []
     # The pattern previously contained '记者||编辑||'; the empty
     # alternatives matched every fragment, so the content was always
     # empty.
     skip_pattern = r'编辑:|注:|关注(.*?)公众号|作者:|请关注|微信号:|本文为|未经授权|作者原创|微信公号:|微信ID:|作者简介:|原标题:|记者|编辑|来源:'
     kept = []
     for text in contents:
         if re.search(skip_pattern, text):
             continue
         if re.search(r'- END -|END', text):
             break
         kept.append(text.strip())
     content = ''.join(kept)
     for junk in ('\r', '\n', '\t', '\xa0', '\u3000'):
         content = content.replace(junk, '')
     item['first_tag'] = '凤凰网'
     item['second_tag'] = meta['second_tag']
     item['third_tag'] = meta['third_tag']
     item['article_url'] = response.url
     item['article_title'] = meta['title']
     item['article_content'] = content
     yield item
Пример #14
0
 def get_content(self, response):
     """Build an item from an article page and yield it.

     Text nodes of class-less paragraphs in ``div.article`` are joined,
     skipping the "original title" / "special statement" lines and any
     line that repeats the article title.

     :param response: page response; ``response.meta`` must provide
         ``title`` and ``third_tag``.
     :return: generator yielding one ``JoviLonglasttimeItem``.
     """
     item = JoviLonglasttimeItem()
     meta = response.meta
     fragments = response.xpath(
         '//div[@class="article"]/p[not(@class)]//text()').extract()
     title = meta['title']
     skip_terms = [r'原标题:', r'特别声明:']
     if title:
         # The title is plain text, not a regex: escape it so characters
         # such as '?', '(' or '|' cannot break or alter the pattern. An
         # empty title is left out because an empty alternative would
         # match every fragment.
         skip_terms.append(re.escape(title))
     skip_pattern = '|'.join(skip_terms)
     item['article_content'] = ''.join(
         text.strip() for text in fragments
         if not re.search(skip_pattern, text))
     item['article_title'] = title
     item['article_url'] = response.url
     item['first_tag'] = '新浪滚动'
     item['second_tag'] = '新浪滚动'
     item['third_tag'] = meta['third_tag']
     yield item
Пример #15
0
 def get_content(self, response):
     """Build an item from article data embedded in the page script.

     Content, title and the Chinese tag are pulled out of the response
     text with regular expressions. If any of the three is missing, an
     INFO message is logged and nothing is yielded. The entity
     replacements in ``self.HTML_entity`` are applied to the content
     before it is parsed as HTML.

     :param response: page response; ``response.meta`` must provide
         ``first_tag``.
     :return: generator yielding at most one ``JoviLonglasttimeItem``.
     """
     meta = response.meta
     res = response.text
     item = JoviLonglasttimeItem()
     content = re.search(r' content: \'(.*?)\'.slice\(6, -6\),', res)
     title = re.search(r' title: \'(.*?)\'.slice\(6, -6\),', res)
     second_tag = re.search(r'chineseTag: \'(.*?)\',', res)
     if not content:
         log.msg('此URL没有文章----%s' % response.url, level=log.INFO)
         return
     if not title:
         log.msg('此URL没有标题----%s' % response.url, level=log.INFO)
         return
     if not second_tag:
         log.msg('此URL没有二级标签----%s' % response.url, level=log.INFO)
         return
     # Mirror the page's own '.slice(6, -6)': drop six characters at each
     # end of the captured literal.
     content = content.group(1)[6:-6]
     title = title.group(1)[6:-6]
     second_tag = second_tag.group(1)
     for entity, replacement in self.HTML_entity.items():
         content = content.replace(entity, replacement)
     fragments = etree.HTML(content).xpath('//p//text()')
     # The pattern previously contained '|||'; the empty alternatives
     # matched every fragment, so the article was always empty.
     skip_pattern = r"图片来自|原标题:|选自:|公众号|▲|文章来自|本文|来源|来自网络|作者:|声明:|译自|如有侵权|\||责任编辑:|编者按|往期回顾|记者|点击进入|联合出品|【精彩推荐】|·|责编|源丨|文丨|转载联系"
     article = ''.join(
         text.strip() for text in fragments
         if not re.search(skip_pattern, text))
     for junk in ('\r', '\n', '\t', '\xa0', '\u3000', '\u200b'):
         article = article.replace(junk, '')
     item['article_content'] = article
     item['article_title'] = title
     item['first_tag'] = meta['first_tag']
     item['second_tag'] = second_tag
     item['article_url'] = response.url
     yield item
Пример #16
0
 def get_article(self, response):
     """Build an item from the class-less paragraphs of ``#p-detail``.

     Direct text nodes matching the credits pattern are dropped, the rest
     is stripped, joined and cleared of whitespace characters.

     :param response: page response; ``response.meta`` must provide
         ``second_tag`` and ``article_title``.
     :return: generator yielding one ``JoviLonglasttimeItem``.
     """
     meta = response.meta
     item = JoviLonglasttimeItem()
     fragments = response.xpath(
         '//div[@id="p-detail"]/p[not(@class)]/text()').extract()
     credits_pattern = r' 策划:|撰文:'
     kept = [
         fragment.strip() for fragment in fragments
         if not re.search(credits_pattern, fragment)
     ]
     body = ''.join(kept)
     for junk in ('\r', '\n', '\t', '\xa0', '\u3000'):
         body = body.replace(junk, '')
     item['first_tag'] = '新华网'
     item['second_tag'] = meta['second_tag']
     item['article_url'] = response.url
     item['article_title'] = meta['article_title']
     item['article_content'] = body
     yield item
Пример #17
0
 def get_content(self, response):
     """Build an item from an article page and yield it.

     The second-level tag is looked up in ``self.channels`` by the host of
     the URL, the title is the first ``h1`` text node, and the content is
     the stripped paragraph text found in any of the known containers.

     :param response: page response.
     :return: generator yielding one ``JoviLonglasttimeItem``.
     """
     item = JoviLonglasttimeItem()
     item['article_url'] = response.url
     item['first_tag'] = '人民网'
     host = urlparse(response.url).netloc
     item['second_tag'] = self.channels.get(host)
     item['article_title'] = response.xpath('//h1/text()').get()
     # One union expression covering every known article container.
     content_xpath = (
         '//*[@id="rwb_zw"]//p//text() | '
         '//*[@class="box_con"]//p//text() |'
         ' //*[@class="box_con w1000 clearfix"]//p//text() | '
         '//*[@class="content clear clearfix"]//p//text() |'
         '//*[@class="show_text"]//p//text() |'
         '//*[@id="p_content"]//p//text() |'
         '//*[@class="artDet"]//p//text() |'
         '//*[@class="text"]//p//text() |'
         '//*[@class="text width978 clearfix"]//p//text() |'
         '//*[@id="zoom"]//p//text() |'
         '//*[@class="text_show"]//p//text()'
     )
     fragments = response.xpath(content_xpath).getall()
     item['article_content'] = ''.join(
         fragment.strip() for fragment in fragments)
     yield item
Пример #18
0
 def get_content(self, response):
     """Build an item from ``p.text`` and empty-class ``h2`` nodes.

     Direct text nodes matching the boilerplate pattern are dropped, the
     rest is stripped, joined and cleared of whitespace characters.

     :param response: page response; ``response.meta`` must provide
         ``second_tag`` and ``title``.
     :return: generator yielding one ``JoviLonglasttimeItem``.
     """
     meta = response.meta
     item = JoviLonglasttimeItem()
     fragments = response.xpath(
         '//p[@class="text"]/text() | //h2[@class=""]/text()').extract()
     noise_pattern = r'版权声明|来自:|关注:|搜索:|图:|点击播放|公众号:|文章来源'
     kept = [
         fragment.strip() for fragment in fragments
         if not re.search(noise_pattern, fragment)
     ]
     body = ''.join(kept)
     # Original author's note: apart from the copyright notice there is no
     # other noise; it is removed with replace.
     # Original author's note: only two tag levels are set here as needed;
     # the matching pipeline was rewritten to keep the document structure
     # simple.
     for junk in ('\n', '\t', '\r', '\u3000', '\xa0'):
         body = body.replace(junk, '')
     item['first_tag'] = '天天快报'
     item['second_tag'] = meta['second_tag']
     item['article_url'] = response.url
     item['article_title'] = meta['title']
     item['article_content'] = body
     yield item
Пример #19
0
 def get_content(self, response):
     """Build an item from an article page and yield it.

     Paragraph text nodes are taken from the ``article`` container
     (paragraphs without ``data-role``) or the ``article-text`` container,
     skipping boilerplate lines.

     :param response: page response; ``response.meta`` must provide
         ``title``, ``first_tag``, ``second_tag`` and ``third_tag``.
     :return: generator yielding one ``JoviLonglasttimeItem``.
     """
     meta = response.meta
     item = JoviLonglasttimeItem()
     fragments = response.xpath(
         '//*[@class="article"]/p[not(@data-role)]//text()|//*[@class="article-text"]/p//text()'
     ).extract()
     # The pattern previously contained '|||'; the empty alternatives
     # matched every fragment, so the content was always empty.
     skip_pattern = r'返回搜狐|原标题:|选自:|公众号|▲|文章来自|本文|来源|来自网络|作者:|声明:|译自|如有侵权|\||编辑:|编者按|往期回顾|记者|点击进入|联合出品|【精彩推荐】|·|导读|导言:|导读:'
     article_content = ''.join(
         text.strip() for text in fragments
         if not re.search(skip_pattern, text))
     for junk in ('\r', '\n', '\t', '\u3000', '\xa0'):
         article_content = article_content.replace(junk, '')
     item['article_content'] = article_content
     item['article_url'] = response.url
     item['article_title'] = meta['title']
     item['first_tag'] = meta['first_tag']
     item['second_tag'] = meta['second_tag']
     item['third_tag'] = meta['third_tag']
     yield item
Пример #20
0
 def get_content(self, response):
     """Build an item from the ``content-article`` paragraphs.

     Responses with an empty body yield nothing. The title is the first
     ``h1`` text node; paragraphs matching the credits pattern are dropped
     and the rest is stripped, joined and cleared of whitespace
     characters.

     :param response: page response; ``response.meta`` must provide
         ``first_tag``, ``second_tag``, ``third_tag``, ``update_time`` and
         ``source``.
     :return: generator yielding at most one ``JoviLonglasttimeItem``.
     """
     if not response.body:
         return
     meta = response.meta
     item = JoviLonglasttimeItem()
     item['article_title'] = response.xpath('//h1/text()').extract_first()
     paragraphs = response.xpath(
         '//*[@class="content-article"]/p').xpath('string()').extract()
     credits_pattern = r'原标题:|图片来自|图片来源|作者:|微信公众号|更多信息请关注|来源:'
     kept = [
         paragraph.strip() for paragraph in paragraphs
         if not re.search(credits_pattern, paragraph)
     ]
     body = ''.join(kept)
     for junk in ('\r', '\n', '\t', '\u3000', '\xa0'):
         body = body.replace(junk, '')
     item['article_content'] = body
     item['article_url'] = response.url
     item['first_tag'] = meta['first_tag']
     item['second_tag'] = meta['second_tag']
     item['third_tag'] = meta['third_tag']
     item['update_time'] = meta['update_time']
     item['source'] = meta['source']
     yield item