示例#1
0
    def parse_content(self, response):
        """Parse a single article page into a YfspiderspeakItem.

        Extracts title, body text, publish time, image and video URLs,
        and derives a stable id from the last URL path segment.
        """

        def deal_id(id_raw):
            # hashlib.md5() requires bytes on Python 3; the URL slug is a
            # str, so encode it first (the original call raised TypeError
            # when run under Python 3).
            if isinstance(id_raw, str):
                id_raw = id_raw.encode('utf-8')
            return hashlib.md5(id_raw).hexdigest()

        print(response.url)
        content_loader = ItemLoader(response=response,
                                    item=YfspiderspeakItem())
        content_loader.add_value('url', response.url)
        content_loader.add_value('spider_time', time.time())

        # Title: first text node, stripped; None when nothing matched.
        content_loader.add_xpath(
            'title',
            '//div[@id="content"]/div[@id="content-main"]//div[@class="entry clearfix"]//h1[@class="post-title entry-title"]//text()',
            lambda x: x[0].strip() if x else None)
        content_loader.add_xpath(
            'content',
            '//div[@id="content"]/div[@id="content-main"]//div[@class="entry clearfix"]/div[@class="entry-content clearfix"]/p//text()',
            Join())
        # @title looks like "2018-05-01T12:00" -> "2018-05-01 12:00:00".
        content_loader.add_xpath(
            'publish_time',
            '//div[@id]/div[@class="entry clearfix"]/div[@class="date updated alpha with-year"]/span/@title',
            lambda x: x[0].replace('T', ' ') + ':00' if x else None)
        content_loader.add_value('id',
                                 response.url.strip('/').split('/')[-1],
                                 deal_id)
        content_loader.add_xpath('img_urls',
                                 '//div[@id="content-main"]//img/@src')
        content_loader.add_xpath(
            'video_urls',
            '//div[@id="content"]//div[@class="hentry-container clear"]//iframe/@src'
        )

        item1 = content_loader.load_item()
        return item1
示例#2
0
    def parse_content(self, response):
        """Build a YfspiderspeakItem from one article page."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            # The datetime attribute looks like "2018-05-01T12:00:00+08:00";
            # keep the part before the timezone offset, or fall back to a
            # fixed default when nothing usable was extracted.
            if not publish_time_list:
                return '2018-02-01 00:00:00'
            raw = publish_time_list[0]
            if '+' not in raw:
                return '2018-02-01 00:00:00'
            return raw.split('+')[0].replace('T', ' ')

        loader = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('spider_time', time.time())
        loader.add_xpath(
            'title',
            '//div[@id="content"]//h1[@class="pg-title"]/text()',
            lambda parts: ''.join(parts))
        loader.add_xpath(
            'content',
            '//div[@id="content"]//div[@id="article-content"]/div[@class="wsw"]//text()',
            lambda parts: [p.strip() for p in parts],
            Join())
        # Article id: last path segment of the URL.
        loader.add_value('id', response.url.strip('/').split('/')[-1])
        loader.add_xpath(
            'img_urls',
            '//div[@id="content"]//div[@id="article-content"]/div[@class="wsw"]//img/@src')
        loader.add_xpath(
            'publish_time',
            '//div[@id="content"]//div[@class="published"]//time/@datetime',
            deal_publish_time)

        item = loader.load_item()
        print(item)
        return item
示例#3
0
    def parse_content(self, response):
        """Build a YfspiderspeakItem from one story page."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            # Join the extracted date fragments and append a midnight time;
            # fall back to a fixed default when nothing matched.  (The
            # original wrapped the concatenation in a bare try/except, but
            # str + str cannot raise here, so the handler was dead code;
            # the mutable [] default is replaced with None as well.)
            if not publish_time_list:
                return '2018-02-01 00:00:00'
            publish_time_str = ''.join(publish_time_list).strip()
            return publish_time_str + ' 00:00:00'

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath('title', '//div[@id="abovefold"]//div[@id="storypagemaincol"]//h1/text()', lambda x: ''.join(x))
        loader1.add_xpath('content', '//div[@id="abovefold"]//div[@id="storypagemaincol"]//div[@id="storytext"]//text()', lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.strip('/').split('/')[-1])
        # Drop decorative "icon-" images.
        loader1.add_xpath('img_urls', '//div[@id="abovefold"]//div[@id="storypagemaincol"]//div[@id="storytext"]//img/@src', lambda x: [y for y in x if 'icon-' not in y])
        loader1.add_xpath('publish_time', '//div[@id="abovefold"]//div[@id="storypagemaincol"]//div[@id="storytop"]//span[@id="story_date"]//text()', deal_publish_time)

        item = loader1.load_item()
        print(item)
        return item
示例#4
0
    def parse_content(self, response):
        """Scrape one forum topic page into a YfspiderspeakItem."""
        print('in parseMore')

        loader = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('spider_time', time.time())
        # Title: concatenate every text node under the breadcrumb bar.
        loader.add_xpath(
            'title',
            '//div[@class="page"]//div[@class="subject_bg1 nav"]//text()',
            lambda parts: ''.join(parts))
        loader.add_xpath(
            'content',
            '//div[@class="page"]//div[@class="topic_body"]//text()',
            lambda parts: [p.strip() for p in parts],
            Join())
        # Topic id comes from the "t=" query parameter.
        loader.add_value('id', response.url.split('t=')[-1].split('&')[0])
        loader.add_xpath(
            'img_urls',
            '//div[@class="page"]//div[@class="topic_body"]//img/@src')
        # The site exposes no publish date, so record fixed metadata.
        loader.add_value('publish_time', '2018-02-01 00:00:00')
        loader.add_value('publish_user', 'dalailamaworld')
        loader.add_xpath(
            'video_urls',
            '//div[@class="page"]//div[@class="topic_body"]//iframe/@src')

        result = loader.load_item()
        print(result)
        return result
示例#5
0
    def parse_content(self, response):
        """Parse a savetibet.org article page into a YfspiderspeakItem.

        Pages without the standard <h1 class="title"> header do not use
        the common article layout and are skipped.
        """
        print(response.url)

        def deal_img_urls(img_urls_raw):
            """Resolve relative image URLs against the site root.

            PDF download buttons are dropped.  NOTE(review): URLs that
            already contain 'http' are filtered out entirely rather than
            kept — presumably the site only emits relative image paths;
            confirm before changing this behavior.
            """
            img_urls_dealed = []
            for one_url in img_urls_raw:
                if 'download-pdf' in one_url:
                    continue
                if 'http' not in one_url:
                    one_url_dealed = urljoin('http://www.savetibet.org/',
                                             one_url)
                    img_urls_dealed.append(one_url_dealed)

            return img_urls_dealed

        if response.xpath(
                '//div[@id="content"]//div[@id="main"]//h1[@class="title"]'):
            # The title node exists, so the page uses the uniform layout.
            content_loader = ItemLoader(response=response,
                                        item=YfspiderspeakItem())
            content_loader.add_value('url', response.url)
            content_loader.add_value('spider_time', time.time())

            content_loader.add_xpath(
                'title',
                '//div[@id="content"]//div[@id="main"]//h1[@class="title"]//text()',
                lambda x: x[0].strip())
            content_loader.add_xpath(
                'content',
                '//div[@id="content"]//div[@id="main"]//div[@class="entry"]//text()',
                Join())
            content_loader.add_xpath(
                'img_urls', '//div[@id="main"]//div[@class="entry"]//img/@src',
                deal_img_urls)
            content_loader.add_xpath('video_urls',
                                     '//div[@class="entry"]//iframe/@src')

            content_loader.add_value('id',
                                     response.url.strip('/').split('/')[-1])
            if response.xpath('//div[@class="post-meta"]'):
                # ISO timestamp "2018-05-01T12:00:00" -> space-separated.
                content_loader.add_value(
                    'publish_time',
                    response.xpath(
                        '//div[@class="post-meta"]//abbr[@class="date time published"]/@title'
                    ).re(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}'),
                    lambda x: x[0].replace('T', ' ') if x else None)
                content_loader.add_xpath(
                    'publish_user',
                    '//div[@class="post-meta"]/span[@class="author vcard"]/span[@class="fn"]/a/text()'
                )
            else:
                # No metadata block: record a sentinel timestamp.
                content_loader.add_value('publish_time', '1111-11-11 11:11:11')

            item1 = content_loader.load_item()
            return item1
        else:
            print('no,it not content page')
示例#6
0
    def parse_content(self, response):
        """Extract one article page into a YfspiderspeakItem."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            # Keep the "<date>T<time>" part before the "+" timezone offset;
            # anything else falls back to a fixed default timestamp.
            if not publish_time_list:
                return '2018-02-01 00:00:00'
            raw = publish_time_list[0]
            if '+' not in raw:
                return '2018-02-01 00:00:00'
            return raw.split('+')[0].replace('T', ' ')

        loader = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('spider_time', time.time())
        loader.add_xpath(
            'title',
            '//main//div[@class="bd"]//h1[contains(@class,"title")]//text()',
            lambda parts: ''.join(parts))
        loader.add_xpath(
            'content',
            '//main//div[@id="artbody"]/p//text()',
            lambda parts: [p.strip() for p in parts],
            Join())
        # Id: file name of the last path segment, without its extension.
        loader.add_value(
            'id', response.url.strip('/').split('/')[-1].split('.')[0])
        loader.add_xpath('img_urls', '//main//div[@id="artbody"]/p//img/@src')
        loader.add_xpath(
            'publish_time',
            '//main//div[@id="artbody"]//time/@datetime',
            deal_publish_time)

        item = loader.load_item()
        print(item)
        return item
示例#7
0
    def parse_photo(self, response):
        """Parse a photo-gallery page into a YfspiderspeakItem."""

        def deal_img_urls(img_urls):
            """Resolve each extracted image path against the site root.

            The original rebound the iterated name (``img_urls``) inside
            the loop, shadowing the parameter; a separate local is used so
            the code no longer relies on the for-loop's iterator holding a
            reference to the old list.
            """
            img_result = []
            for one_img in img_urls:
                absolute_url = urljoin('http://www.tibetswiss.ch/', one_img)
                img_result.append(absolute_url)
            return img_result

        print(response.url)
        content_loader = ItemLoader(response=response,
                                    item=YfspiderspeakItem())
        content_loader.add_value('url', response.url)
        content_loader.add_value('spider_time', time.time())

        content_loader.add_xpath(
            'title',
            '//div[@id="main"]//div[@class="inside"]//div[@class="title"]/h1/text()'
        )
        content_loader.add_xpath(
            'img_urls', '//div[@id="main"]//div[@class="inside"]//img/@src',
            deal_img_urls)
        # Gallery pages carry no publish date; record a sentinel timestamp.
        content_loader.add_value('publish_time', '1111-11-11 11:11:11')
        content_loader.add_value(
            'id',
            response.url.strip('/').split('/')[-1].split('.html')[0])
        item1 = content_loader.load_item()

        return item1
示例#8
0
    def parse_content(self, response):
        """Scrape a single article page into a YfspiderspeakItem."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            # First extracted value, stripped; fixed default otherwise.
            if not publish_time_list:
                return '2018-02-01 00:00:00'
            return publish_time_list[0].strip()

        loader = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('spider_time', time.time())
        loader.add_xpath(
            'title',
            '//div[@class="main"]//div[@class="dia-lead"]//h1/text()',
            lambda parts: ''.join(parts))
        loader.add_xpath(
            'content',
            '//div[@class="main"]//div[@class="dia-lead-one"]/div[@id]//text()',
            lambda parts: [p.strip() for p in parts],
            Join())
        loader.add_value(
            'id', response.url.strip('/').split('/')[-1].split('.')[0])
        loader.add_xpath(
            'img_urls',
            '//div[@class="main"]//div[@class="dia-lead-one"]/div[@id]//img/@src   ')
        loader.add_xpath(
            'publish_time',
            '//div[@class="main"]//div[@class="dia-lead"]//div[@class="sign1"]/div[@class="r"]/text()',
            deal_publish_time)

        item = loader.load_item()
        print(item)
        return item

    # def parse_comments(self,response):
    #     pass
示例#9
0
    def parse_content(self, response):
        """Parse a WordPress-style article page into a YfspiderspeakItem."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            """Format ('YYYY', 'MM', 'DD') regex groups as a timestamp.

            Falls back to a fixed default when fewer than three groups
            were captured.  (Replaces the original's mutable-default
            argument and unused ``as e`` binding.)
            """
            try:
                year = publish_time_list[0]
                month = publish_time_list[1]
                days = publish_time_list[2]
                return str(year) + '-' + str(month) + '-' + str(days) + ' 00:00:00'
            except Exception:
                return '2018-02-01 00:00:00'

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath('title', '//div[@id="container"]//div[@id="main"]//h1[@class="entry_title"]//text()', lambda x: ''.join(x).strip())
        loader1.add_xpath('content', '//div[@id="container"]//div[@id="main"]//p/text()', lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.strip('/').split('/')[-1].split('.')[0])
        loader1.add_xpath('img_urls', '//div[@id="container"]//div[@id="main"]//img/@src')
        loader1.add_xpath('video_urls', '//div[@id="container"]//div[@id="main"]//p//iframe[@allow]/@src')
        # Raw string keeps the regex free of invalid-escape warnings.
        loader1.add_value('publish_time', response.xpath('//div[@id="container"]//div[@id="main"]//div[@class="singlepostmeta"]//text()').re(r'(\d{4})/(\d{2})/(\d{2})'), deal_publish_time)
        loader1.add_xpath('publish_user', '//div[@id="container"]//div[@id="main"]//a[@rel="author"]//text()', lambda x: ''.join(x))

        item = loader1.load_item()
        print(item)
        return item
示例#10
0
    def parse_content(self, response):
        """Parse a Tibetan-language article page into a YfspiderspeakItem."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            """Convert a date string with Tibetan digits to 'Y-m-d H:M:S'.

            Tibetan numerals are transliterated to ASCII digits, then the
            first "<month> ... <day> ... <year>" digit triple is parsed.
            Returns a fixed default when nothing parses.
            """
            if not publish_time_list:
                return '2018-02-01 00:00:00'
            publish_time_str = publish_time_list[0]
            try:
                tibetan_digits = {
                    u"༧": u"7",
                    u"༦": u"6",
                    u"༥": u"5",
                    u"༤": u"4",
                    u"༣": u"3",
                    u"༢": u"2",
                    u"༡": u"1",
                    u"༠": u"0",
                    u"༩": u"9",
                    u"༨": u"8"
                }
                # BUG FIX: the original tested `onekey in publish_time_list`,
                # i.e. membership in the *list* of extracted strings, so a
                # lone Tibetan digit never matched and was never replaced.
                # str.replace is a no-op when the digit is absent, so just
                # replace unconditionally on the string.
                for tib, arabic in tibetan_digits.items():
                    publish_time_str = publish_time_str.replace(tib, arabic)

                re_find_time = re.compile(r'(\d{1,2}).*?(\d{1,2}).*?(\d{4})')
                # findall gives e.g. [(u'5', u'19', u'2018')]
                month, days, year = re_find_time.findall(publish_time_str)[0]
                month = str(month).zfill(2)
                days = str(days).zfill(2)
                return str(year) + '-' + month + '-' + days + ' 00:00:00'

            except Exception:
                return '2018-02-01 00:00:00'

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath('title', '//main//div[@class="page-header"]//h1//text()', lambda x: ''.join(x))
        loader1.add_xpath('content', '//main//div[@class="entry-content"]//p//text()', lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.strip('/').split('/')[-1])
        loader1.add_xpath('img_urls', '//main//div[@class="entry-content"]//p//img/@src')
        loader1.add_xpath('publish_time', '//header//div[@class="single_meta_item single_meta_date"]//text()', deal_publish_time)
        loader1.add_xpath('publish_user', '//header//div[@id="single_byline"]//text()[2]')

        item = loader1.load_item()
        print(item)
        return item
示例#11
0
    def parse_content(self, response):
        """Parse a kagyuoffice.org.tw news page into a YfspiderspeakItem."""
        print('in parseMore')

        def deal_publish_time(publish_time_list):  # e.g. '20180502...'
            """Turn a leading YYYYMMDD (or YYYY-MM-DD) prefix into a timestamp."""
            if not publish_time_list:
                return '2018-05-09 00:00:00'
            digits = publish_time_list.replace('-', '')
            year = digits[0:4]
            month = digits[4:6]
            day = digits[6:8]
            return year + '-' + month + '-' + day + ' 00:00:00'

        def deal_read_count(read_count=None):
            """Join the regex digit fragments into an int; 0 when absent."""
            if read_count:
                return int(str(''.join(read_count)))
            else:
                return 0

        def deal_img_urls(img_urls_raw):
            """Absolutize site-relative image paths, dropping UI buttons.

            BUG FIXES vs. the original: the button check now inspects each
            URL (the original tested 'printButton' against the whole list),
            and the collected list is actually returned (the original fell
            off the end and returned None, discarding every image).
            """
            url_list = []
            for one_img_url in img_urls_raw:
                if 'printButton' in one_img_url or 'emailButton' in one_img_url:
                    continue
                if 'https://www.kagyuoffice.org.tw' not in one_img_url:
                    if one_img_url.startswith('/'):
                        url_list.append(
                            'https://www.kagyuoffice.org.tw' + one_img_url)
            return url_list

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath(
            'title',
            '//div[@id="gj-main-content"]//div[@class="span9"]//div[@class="item-page"]//h2//a//text()',
            lambda x: ''.join([y for y in x]))
        loader1.add_xpath(
            'content',
            '//div[@id="gj-main-content"]//div[@class="span9"]//div[@class="item-page"]/p//text()',
            lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.split('news/')[-1])
        loader1.add_xpath(
            'img_urls',
            '//div[@id="gj-main-content"]//div[@class="span9"]//div[@class="item-page"]//img/@src',
            deal_img_urls)
        # The date prefix of the news slug doubles as the publish time.
        loader1.add_value('publish_time',
                          response.url.split('news/')[-1], deal_publish_time)
        loader1.add_value('publish_user', 'kagyuoffice')
        loader1.add_value(
            'read_count',
            response.xpath(
                '//div[@class="item-page"]//dd[@class="gj-hits"]//text()').re(
                    r'.*?(\d*)'), deal_read_count)

        item = loader1.load_item()
        print(item)
        return item
示例#12
0
    def parse_content(self, response):
        """Parse an FT Chinese story page into a YfspiderspeakItem.

        Pages whose header carries no parsable story time are treated as
        non-content pages and skipped.
        """
        print(response.url)

        def deal_img_urls(img_url_list):
            # Pass-through hook kept so URL filtering can be added later.
            return img_url_list

        def deal_publish_time(publish_time_raw_list):
            """Build 'Y-m-d H:M:00' from (year, month, day, hour, minute) groups.

            Returns None (so the loader skips the field) when the groups
            are missing or malformed.
            """
            try:
                year = str(publish_time_raw_list[0])
                month = str(publish_time_raw_list[1]).zfill(2)
                days = str(publish_time_raw_list[2]).zfill(2)
                hours = str(publish_time_raw_list[3])
                minutes = str(publish_time_raw_list[4])
                return year + '-' + month + '-' + days + ' ' + hours + ':' + minutes + ':00'
            except Exception as e:
                print(e)

        def deal_reply_nodes(response_url):
            # Comments require a separate request, so emit the comments URL;
            # reply_nodes are generated in later processing.  (Currently
            # unused — the add_value call below is disabled.)
            reply_id = response_url.split('/')[-1].split('?')[0]
            reply_url = 'http://www.ftchinese.com/index.php/c/newcomment/' + reply_id + '?v=1'
            return reply_url

        def deal_publish_user(publisher_list):
            return [one_user.strip() for one_user in publisher_list]

        # Extract the story time once; the original ran the same xpath+regex
        # twice (guard and value).  No match -> not a content page.
        time_groups = response.xpath('//span[@class="story-time"]/text()').re(
            r'(\d{4}).(\d{1,2}).(\d{1,2}). (\d{1,2})\:(\d{1,2})')
        if not time_groups:
            return

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath('title', '//h1[@class="story-headline"]/text()', TakeFirst())
        loader1.add_value('id', response.url.split('/')[-1].split('?')[0])
        loader1.add_value('img_urls', response.xpath('//div[@class="story-container"]//img/@src|//div[@class="story-container"]//figure/@data-url').extract(), deal_img_urls)
        loader1.add_xpath('content', '//div[@class="story-body"]//p//text()', Join())
        loader1.add_value('publish_time', time_groups, deal_publish_time)
        loader1.add_xpath('publish_user', '//span[@class="story-author"]/a/text()', deal_publish_user)
        loader1.add_value('reply_count', response.xpath('//div[@id="allcomments"]/div[@class="commentcontainer"]'), lambda x: len(x))
        # loader1.add_value('reply_nodes', response.url, deal_reply_nodes)

        item1 = loader1.load_item()
        return item1
示例#13
0
    def parse_content(self, response):
        """Parse an article page into a YfspiderspeakItem."""

        def deal_publish_time(publish_time_raw):
            """Normalize 'YYYY-MM-DD HH:MM' to include seconds.

            Uses the last regex match; values of other lengths are
            returned unchanged.  (Indexing with [-1] avoids mutating the
            loader's value list, which the original's .pop() did, and the
            new empty guard avoids an IndexError when nothing matched.)
            """
            if not publish_time_raw:
                return None
            publish_time_str = publish_time_raw[-1].strip()
            if len(publish_time_str) == 16:
                return publish_time_str + ':00'
            return publish_time_str

        def deal_content(content_list_raw):
            # Concatenate stripped fragments; join is linear where += is not.
            return ''.join(one_content.strip() for one_content in content_list_raw)

        def deal_img_urls(img_urls_raw):
            """Prefix protocol-relative image paths with 'http:'.

            NOTE(review): URLs already containing 'http' are dropped, not
            kept — presumably the site emits only protocol-relative
            sources; confirm before changing.
            """
            img_url_list = []
            for one_img_url in img_urls_raw:
                if 'http' not in one_img_url:
                    img_url_list.append('http:' + one_img_url)
            return img_url_list

        def deal_publish_user(publish_user_raw):
            # '作者:' capture -> list of author names split on spaces.
            if publish_user_raw:
                return publish_user_raw[0].strip().split(' ')

        print('in parseMore')
        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_value('id', response.url.split('/')[-1].split('.')[0])
        loader1.add_xpath('title',
                          '//div[@class="col-left"]/div/center/h1/text()',
                          lambda x: x[0].strip())
        loader1.add_value(
            'publish_time',
            response.xpath(
                '//div[@class="col-left"]/div/div[@class="fontsize"]/div/h2').
            re(r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2})'), deal_publish_time)
        loader1.add_xpath(
            'content',
            '//div[@class="article"]/div[@class="article_right"]/p/text()',
            deal_content)
        loader1.add_value(
            'img_urls',
            response.xpath('//div[@class="article"]//img/@src').extract(),
            deal_img_urls)
        loader1.add_value(
            'publish_user',
            response.xpath('//div[@class="col-left"]//div[@class="fontsize"]').
            re(r'作者:(.*)\n?'), deal_publish_user)

        item1 = loader1.load_item()
        return item1
示例#14
0
    def parse_content(self, response):
        """Scrape one article page into a YfspiderspeakItem."""
        print('in parseMore')

        def deal_publish_time(publish_time_str):
            # The URL path embeds the date as .../<year>/<MMDD>/...;
            # fall back to a fixed default when it cannot be parsed.
            if not publish_time_str:
                return '2018-02-01 00:00:00'
            cleaned = publish_time_str.strip()
            try:
                segments = cleaned.split('/')
                year, month_day = segments[-3], segments[-2]
                return year + '-' + month_day[0:2] + '-' + month_day[2:4] + ' 00:00:00'
            except Exception:
                return '2018-02-01 00:00:00'

        def deal_img_urls(img_urls_raw):
            # Skip placeholder gifs; absolutize protocol-relative sources.
            img_urls_list = []
            for one_img_url in img_urls_raw:
                if 'empty.gif' in one_img_url:
                    continue
                if 'http' not in one_img_url:
                    img_urls_list.append('http:' + one_img_url)
            return img_urls_list

        loader = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('spider_time', time.time())
        loader.add_xpath('title',
                         '//div[@id="main"]//div[@id="Article"]/h1//text()',
                         lambda parts: ''.join(parts))
        loader.add_xpath(
            'content',
            '//div[@id="main"]//div[@id="Article"]//article[@id="content"]//text()',
            lambda parts: [p.strip() for p in parts], Join())
        loader.add_value('id', response.url.strip('/').split('/')[-1])
        loader.add_xpath(
            'img_urls',
            '//div[@id="main"]//div[@id="Article"]//article[@id="content"]//img/@src|//div[@id="main"]//div[@id="Article"]//article[@id="content"]//img/@data-src',
            deal_img_urls)
        # The publish date is parsed straight out of the page URL.
        loader.add_value('publish_time', response.url, deal_publish_time)
        loader.add_xpath(
            'publish_user',
            '//div[@id="main"]//div[@id="Article"]//p[@id="editor"]//b/text()')
        loader.add_xpath(
            'video_urls',
            '//div[@id="main"]//div[@id="Article"]//article[@id="content"]//iframe/@src'
        )

        item = loader.load_item()
        print(item)
        return item
示例#15
0
    def parse_content(self, response):
        """Parse one article page into a YfspiderspeakItem."""
        print('has get one_website', response.url)

        def deal_publish_time(publish_time_raw_list):
            """Turn ('May', '7', '2018') regex groups into 'Y-m-d 00:00:00'."""
            month_name = str(publish_time_raw_list[0])
            # Zero-pad the day so the result is a valid fixed-width date
            # (the original emitted e.g. '2018-05-7 00:00:00').
            day_str = str(publish_time_raw_list[1]).zfill(2)
            year_str = str(publish_time_raw_list[2])

            month_transform = {
                'January': '01',
                'February': '02',
                'March': '03',
                'April': '04',
                'May': '05',
                'June': '06',
                'July': '07',
                'August': '08',
                'September': '09',
                'October': '10',
                'November': '11',
                'December': '12'
            }

            month_num = month_transform[month_name]

            return year_str + '-' + month_num + '-' + day_str + ' 00:00:00'

        def deal_id(id_raw):
            # md5 requires bytes on Python 3; the original passed the str
            # slug directly, which raises TypeError.  (Parameter renamed so
            # it no longer shadows the id() builtin.)
            if isinstance(id_raw, str):
                id_raw = id_raw.encode('utf-8')
            return hashlib.md5(id_raw).hexdigest()

        loaders1 = ItemLoader(response=response, item=YfspiderspeakItem())
        loaders1.add_value('url', response.url)
        loaders1.add_value('spider_time', time.time())
        loaders1.add_xpath(
            'title',
            '//*[@id="the-post"]/div[@class="post-inner"]/h1/span/text()')
        loaders1.add_value(
            'publish_time',
            response.xpath(
                '//*[@id="the-post"]//span[@class="tie-date"]/text()').re(
                    r'(\S*) (\d{1,2})\, (\d{1,4})'), deal_publish_time)
        loaders1.add_xpath(
            'content', '//div[@class="content"]//div[@class="entry"]//text()',
            Join())
        loaders1.add_value(
            'img_urls',
            response.xpath('//*[@id="the-post"]/div/div[@class="entry"]//img').
            re(r'src="(.*?)"'))
        loaders1.add_value('id',
                           response.url.split('chinese/')[1].strip('/'),
                           deal_id)

        item1 = loaders1.load_item()
        return item1
示例#16
0
    def parse_content(self, response):
        """Parse an English-dated article page into a YfspiderspeakItem."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            """Convert ('January', '5', '2018')-style parts to 'Y-m-d 00:00:00'.

            Falls back to a fixed default on any lookup/parse failure.
            (The original aliased the list under two names and indexed
            both, used a mutable [] default, a bare except, and leaked
            un-stripped whitespace into two-digit days.)
            """
            if not publish_time_list:
                return '2018-02-01 00:00:00'
            try:
                month_name = publish_time_list[0]
                day = publish_time_list[1]
                year = publish_time_list[2]

                month_transform = {
                    'January': '01',
                    'February': '02',
                    'March': '03',
                    'April': '04',
                    'May': '05',
                    'June': '06',
                    'July': '07',
                    'August': '08',
                    'September': '09',
                    'October': '10',
                    'November': '11',
                    'December': '12'
                }
                month_num = month_transform[str(month_name).strip()]
                day = str(day).strip().zfill(2)
                return str(year) + '-' + month_num + '-' + day + ' 00:00:00'

            except Exception:
                return '2018-02-01 00:00:00'

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath('title',
                          '//div[@class="hideOnNavigation"]//h1//text()',
                          lambda x: ''.join([y for y in x]))
        loader1.add_xpath(
            'content', '//div[contains(@class,"newsContentArea")]//p//text()',
            lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.strip('/').split('/')[-1])
        loader1.add_xpath(
            'img_urls', '//div[contains(@class,"newsContentArea")]//img/@src')
        loader1.add_xpath('publish_time',
                          '//div[@class="hideOnNavigation"]//h1/span//text()',
                          deal_publish_time)

        item = loader1.load_item()
        print(item)
        return item
示例#17
0
    def parse_content(self, response):
        """Parse an article page whose date reads like 'June 5th, 2018'."""

        def deal_publish_time(publish_time_raw):
            """Convert ['Month DDth, YYYY'] to 'YYYY-MM-DD 00:00:00'.

            The day keeps only its digits, so every English ordinal suffix
            ('st'/'nd'/'rd'/'th') works — the original stripped only
            'th,' — and is zero-padded to two characters.  Returns a
            sentinel timestamp when nothing was extracted.
            """
            if not publish_time_raw:
                return '1111-11-11 11:11:11'
            publish_time_DMY = publish_time_raw[0].split(' ')
            month_name = publish_time_DMY[0]
            day_str = ''.join(
                ch for ch in publish_time_DMY[1] if ch.isdigit()).zfill(2)
            year_str = publish_time_DMY[2]

            month_transform = {
                'January': '01',
                'February': '02',
                'March': '03',
                'April': '04',
                'May': '05',
                'June': '06',
                'July': '07',
                'August': '08',
                'September': '09',
                'October': '10',
                'November': '11',
                'December': '12'
            }
            month_num = month_transform[str(month_name)]

            return year_str + '-' + month_num + '-' + day_str + ' 00:00:00'

        print(response.url)
        content_loader = ItemLoader(response=response,
                                    item=YfspiderspeakItem())
        content_loader.add_value('url', response.url)
        content_loader.add_value('spider_time', time.time())
        content_loader.add_value('id', response.url.strip('/').split('/')[-1])

        content_loader.add_xpath(
            'title', '//div[@id="content"]//h2[@class="entry-title"]//text()')
        content_loader.add_xpath(
            'content',
            '//div[@id="content"]//div[@class="post-content"]//text()', Join())
        # Accept any English ordinal suffix, not just 'th' (the original
        # regex required 'th,' and fell back to the sentinel otherwise).
        content_loader.add_value(
            'publish_time',
            response.xpath(
                '//div[@id="content"]//div[@class="fusion-meta-info"]//text()'
            ).re(r'\S* \d{1,2}(?:st|nd|rd|th), \d{1,4}'), deal_publish_time)
        content_loader.add_xpath(
            'img_urls', '//div[@id="main"]//div[@id="content"]//img/@src')

        item1 = content_loader.load_item()
        return item1
示例#18
0
    def parse_content(self, response):
        """Parse one chushigangdrug.ch page into a YfspiderspeakItem.

        Pages without the expected title table are reported and skipped.
        """

        def deal_img_urls(img_urls_raw):
            """Absolutize relative image paths against the page's section.

            Vote-arrow decoration images are skipped.  The base directory is
            chosen from the section found in the page URL
            (verein / tanzgruppe / galerie), defaulting to the site root.
            """
            img_urls_list = []
            for one_img_url_raw in img_urls_raw:
                if 'mages/up' in one_img_url_raw or 'arrow_big_up' in one_img_url_raw:
                    continue
                if 'verein' in response.url:
                    base_url = 'http://www.chushigangdrug.ch/verein/'
                elif 'tanzgruppe' in response.url:
                    base_url = 'http://www.chushigangdrug.ch/tanzgruppe/'
                elif 'galerie' in response.url:
                    base_url = 'http://www.chushigangdrug.ch/galerie/'
                else:
                    base_url = 'http://www.chushigangdrug.ch/'
                # lstrip, not strip: only the leading '..'/'.' of a relative
                # path must go — strip('.') would also eat a trailing dot.
                img_urls_list.append(base_url + one_img_url_raw.lstrip('.'))

            return img_urls_list

        print(response.url)
        if response.xpath('//table//table[@class="titel"]//tr/td'):
            content_loader = ItemLoader(response=response,
                                        item=YfspiderspeakItem())
            content_loader.add_value('url', response.url)
            content_loader.add_value('spider_time', time.time())
            content_loader.add_value(
                'id',
                response.url.strip('/').split('/')[-1].replace('.', '_'))

            content_loader.add_xpath(
                'title', '//table//table[@class="titel"]//tr/td/text()',
                # Guard the empty-extraction case (title cell without text).
                lambda x: x[0].strip() if x else None)
            content_loader.add_xpath(
                'content',
                '//td[@valign="top"]//td[@valign and @bgcolor="#FFFFFF"]//text()',
                Join())
            # The site exposes no machine-readable date; use the sentinel.
            content_loader.add_value('publish_time', '2018-02-01 00:00:00')
            content_loader.add_xpath(
                'img_urls',
                '//td[@valign="top"]//td[@valign and @bgcolor="#FFFFFF"]//img[@width>50]/@src',
                deal_img_urls)

            item1 = content_loader.load_item()
            return item1
        else:
            print('unknown page')
示例#19
0
    def parse_content(self, response):
        '''
        Callback wired up in this spider's ``rules``: every response whose
        link matches those rules is handed to this method.

        :param response: a standard scrapy Response object (see the official
            docs).  Common helpers: response.xpath(), response.css(),
            response.xpath().re(), response.css().re(), ...
        :return: a dict or item (defined in the items module that sits next
            to the spiders package) flows into the pipelines; a Request goes
            back through the scheduler to the downloader.
        '''
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            '''
            Clean up the raw publish-time field.  Every deal_...() helper in
            these spiders post-processes the field it is named after
            (deal_publish_user, deal_img_urls, ...) and is plugged in as an
            ItemLoader processor.

            :param publish_time_list: xpath extraction result (always a
                list); default is None rather than a shared mutable [].
            :return: 'YYYY-MM-DD hh:mm:ss', or a fixed fallback date.
            '''
            if publish_time_list:
                publish_time_str = publish_time_list[0]
            else:
                return '2018-02-01 00:00:00'
            if '+' in publish_time_str:
                publish_time_str_split = publish_time_str.split('+')[0]
                return publish_time_str_split.replace('T', ' ')
            else:
                return '2018-02-01 00:00:00'

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath('title',
                          '//article//h2[@class="entry-title"]/text()',
                          lambda x: ''.join([y for y in x]))
        loader1.add_xpath('content',
                          '//article//div[@class="entry-content"]/p//text()',
                          lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.strip('/').split('/')[-1])
        loader1.add_xpath('img_urls',
                          '//article//div[@class="entry-content"]//img/@src')
        loader1.add_xpath('publish_time',
                          '//article//time[@class="published"]/@datetime',
                          deal_publish_time)
        loader1.add_xpath(
            'publish_user',
            '//article//time[@class="published"]/a[@class="fn"]/text()')

        item = loader1.load_item()
        print(item)
        return item
示例#20
0
    def parse_content(self, response):
        """Parse an article page; the publish date is carried in the URL."""
        print('in parseMore')

        def deal_publish_time(publish_time_str):
            """Extract 'YYYY-MM-DD 00:00:00' from a URL containing
            '.../articles/<year>/<month>/<day>/...'; None when absent."""
            if not publish_time_str:
                print('time is None')
                return None
            # Raw string so '\d' is a regex escape, not a string escape.
            re_find_publish_date = re.compile(
                r'articles/(\d{4})/(\d{1,2})/(\d{1,2})')
            publish_date_list = re_find_publish_date.findall(publish_time_str)
            if not publish_date_list:
                # No date segment in this URL — the original unguarded
                # publish_date_list[0] raised IndexError here.
                return None
            # findall returns a list of (year, month, day) tuples; zero-pad
            # single-digit month/day for a uniform date string.
            date_str_list = [part.zfill(2) for part in publish_date_list[0]]
            return (date_str_list[0] + '-' + date_str_list[1] + '-' +
                    date_str_list[2] + ' 00:00:00')

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath(
            'title',
            '//div[@id="master_container"]//div[contains(@class,"articleTitle")]/h1/text()',
            # TakeFirst can yield None on an empty page: guard the strip.
            TakeFirst(), lambda x: x.strip() if x else None)
        loader1.add_xpath(
            'content',
            '//div[@id="master_container"]//div[contains(@class,"articleContent")]//text()',
            lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id',
                          response.url.split('-')[-1].split('.')[0].strip())
        loader1.add_xpath(
            'img_urls',
            '//div[@id="master_container"]//div[contains(@id,"articleContent")]//img/@src'
        )
        loader1.add_value('publish_time', response.url, deal_publish_time)

        item = loader1.load_item()
        print(item)
        return item
示例#21
0
    def parse_content(self, response):
        """Parse an article; the publish time lives in the head's ld+json."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            """Normalize the regex result to 'YYYY-MM-DD hh:mm:ss'.

            A grouped .re() can hand back tuples, so the first group is
            unwrapped before the 'T' separator is replaced.  Default is None
            rather than a shared mutable [].
            """
            if publish_time_list:
                publish_time_str = publish_time_list[0]
                # Unwrap a (group, ...) tuple produced by a grouped regex.
                if isinstance(publish_time_str, tuple):
                    publish_time_str = publish_time_str[0]
            else:
                return '2018-02-01 00:00:00'
            if publish_time_str:
                return publish_time_str.replace('T', ' ')
            else:
                return '2018-02-01 00:00:00'

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath('title',
                          '//div[@class="page-container"]//header/h1/text()',
                          lambda x: ''.join([y for y in x]))
        loader1.add_xpath(
            'content',
            '//div[@class="page-container"]//div[@itemprop="articleBody"]//text()',
            lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.strip('/').split('/')[-1])
        loader1.add_xpath(
            'img_urls',
            '//div[@class="page-container"]//div[@itemprop="articleBody"]//img/@src'
        )
        loader1.add_value(
            'publish_time',
            response.xpath(
                '//head/script[@type="application/ld+json"]//text()').re(
                    # Raw string; the \"/\:/\- escapes were redundant.
                    r'"dateCreated":"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2})Z"'),
            deal_publish_time)
        # NOTE(review): this selector targets '//article//time...' although
        # everything else on this page lives under page-container — it looks
        # copied from another spider; verify it ever matches.
        loader1.add_xpath(
            'publish_user',
            '//article//time[@class="published"]/a[@class="fn"]/text()')
        loader1.add_xpath(
            'video_urls',
            '//div[@class="page-container"]//div[@itemprop="articleBody"]//iframe/@src'
        )

        item = loader1.load_item()
        print(item)
        return item
示例#22
0
    def parse_content(self, response):
        """Parse a kirti92.org article; the date is encoded in the URL slug."""
        print('in parseMore')

        def deal_publish_time(publish_url_raw):
            """Build 'YYYY-MM-DD hh:mm:ss' from the URL's last path segment.

            NOTE(review): assumes the segment looks like
            '<id>-YYYY-MM-DD-hh-mm-ss...' — confirm against live URLs.
            Falls back to a fixed date when the segment has too few '-'
            separated parts (the original raised IndexError there).
            """
            try:
                publish_time = publish_url_raw.split('/')[-1]
                publish_time_split = publish_time.split('-')
                return (publish_time_split[1] + '-' + publish_time_split[2] +
                        '-' + publish_time_split[3] + ' ' +
                        publish_time_split[4] + ':' + publish_time_split[5] +
                        ':' + publish_time_split[6])
            except IndexError:
                return '2018-02-01 00:00:00'

        def deal_img_urls(img_urls_raw):
            """Prefix site-relative image paths with the kirti92.org host."""
            img_urls = []
            if img_urls_raw:
                for one_img_url in img_urls_raw:
                    if 'www.kirti92.org' not in one_img_url:
                        img_urls.append('http://www.kirti92.org' + one_img_url)
                    else:
                        img_urls.append(one_img_url)
            return img_urls

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath('title', '//div[@id="wrapper"]//h2/a/text()',
                          # TakeFirst can yield None: guard the strip.
                          TakeFirst(), lambda x: x.strip() if x else None)
        loader1.add_xpath(
            'content',
            '//div[@id="centercontent_bg"]//div[@class="item-page"]/p//text()',
            lambda x: [i.strip() for i in x], Join())
        loader1.add_value(
            'id',
            response.url.strip('/').split('/')[-1].split('.')[0].strip())
        loader1.add_xpath(
            'img_urls',
            '//div[@id="centercontent_bg"]//div[@class="item-page"]//img/@src',
            deal_img_urls)
        loader1.add_value('publish_time', response.url, deal_publish_time)

        item = loader1.load_item()
        print(item)
        return item
示例#23
0
文件: ntd.py 项目: possager/YFspider2
    def parse_content(self, response):
        """Parse an ntd.tv article; the publish date sits in the URL path."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=''):
            """Derive 'YYYY-MM-DD 00:00:00' from a URL shaped like
            '...ntd.tv/YYYY/MM/DD/...'; fixed fallback otherwise."""
            try:
                # The split is inside the try so a URL without 'ntd.tv/' (or
                # with a short path) falls back instead of raising IndexError
                # — the original split outside the try crashed on those.
                time_splited = publish_time_list.split('ntd.tv/')[1].split('/')
                year = time_splited[0]
                month = time_splited[1]
                day = time_splited[2]
                return str(year) + '-' + str(month) + '-' + str(
                    day) + ' 00:00:00'
            except (AttributeError, IndexError):
                return '2018-02-01 00:00:00'

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath('title', '//main[@id="main"]//h1//text()',
                          lambda x: ''.join([y for y in x]).strip())
        loader1.add_xpath(
            'content',
            '//main[@id="main"]//div[@class="left_block"]/div[contains(@class,"content")]//p//text()',
            lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.strip('/').split('/')[-1])
        loader1.add_xpath(
            'img_urls',
            '//main[@id="main"]//div[@class="left_block"]/div[contains(@class,"content")]//img/@src'
        )
        loader1.add_value('publish_time', response.url, deal_publish_time)
        loader1.add_xpath(
            'publish_user',
            '//main[@id="main"]//div[@class="author"]//span[@class="author_name"]/text()'
        )
        loader1.add_xpath(
            'video_urls',
            '//main[@id="main"]//div[@class="left_block"]/div[contains(@class,"content") or contains(@class,"container")]//video/@src'
        )

        item = loader1.load_item()
        print(item)
        return item
示例#24
0
    def parse_content(self, response):
        """Parse an article page into a YfspiderspeakItem."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            """Normalize ['YYYY-MM-DDThh:mm:ss+...'] to 'YYYY-MM-DD hh:mm:ss'.

            Default is None rather than a shared mutable []; returns a fixed
            fallback date when the value is missing or has no '+' offset.
            """
            if publish_time_list:
                publish_time_str = publish_time_list[0]
            else:
                return '2018-02-01 00:00:00'
            if '+' in publish_time_str:
                publish_time_str_split = publish_time_str.split('+')[0]
                return publish_time_str_split.replace('T', ' ')
            else:
                return '2018-02-01 00:00:00'

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath(
            'title',
            '//div[@class="article-content clearfix"]//h1[@class="entry-title"]/text()',
            lambda x: ''.join([y.strip() for y in x]))
        loader1.add_xpath(
            'content',
            '//div[@class="article-content clearfix"]//div[@class="entry-content clearfix"]/p//text()',
            lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.strip('/').split('/')[-1])
        loader1.add_xpath(
            'img_urls',
            '//div[@class="article-content clearfix"]//div[@class="entry-content clearfix"]/p//img/@src'
        )
        loader1.add_xpath(
            'publish_time',
            '//div[@class="article-content clearfix"]//div[@class="below-entry-meta"]//time[@class="entry-date published"]/@datetime',
            deal_publish_time)
        loader1.add_xpath(
            'publish_user',
            '//div[@class="article-content clearfix"]//div[@class="below-entry-meta"]/span[@class="byline"]//a[@class="url fn n"]/@title'
        )
        loader1.add_xpath(
            'read_count',
            '//div[@class="article-content clearfix"]//div[@class="below-entry-meta"]/span[@class="post-views"]//span[@class="total-views"]//text()'
        )

        item = loader1.load_item()
        print(item)
        return item
示例#25
0
    def parse_content(self, response):
        """Parse a blog post whose date is embedded in the blog-date link."""

        def deal_publish_time(publish_time_url):
            """Turn ['http://<site>.org/YYYY/MM/DD/'] into
            'YYYY-MM-DD 00:00:00'; None when the href list is empty or the
            href carries no 'org/' marker."""
            print('in deal_publish_time', publish_time_url)
            try:
                publish_time_url = publish_time_url[0]
                # Guard the 'org/' split too — the original only guarded [0],
                # so an unexpected href still raised IndexError.
                publish_time_date_str = publish_time_url.split('org/')[1].strip(
                    '/').replace('/', '-')
            except IndexError:
                return None
            publish_time = publish_time_date_str + ' 00:00:00'
            return publish_time

        def deal_id(id_raw):
            """md5 of the URL slug; md5 requires bytes under Python 3, so
            encode first (md5 of a str raises TypeError)."""
            id_hash = hashlib.md5(id_raw.encode('utf-8')).hexdigest()
            return id_hash

        print(response.url)
        content_loader = ItemLoader(item=YfspiderspeakItem(),
                                    response=response)

        content_loader.add_value('url', response.url)
        content_loader.add_value('spider_time', time.time())

        content_loader.add_xpath(
            'title',
            '//article[@id]/div[@class="gdlr-standard-style"]/div[@class="blog-content-wrapper"]/header/h1/text()'
        )
        content_loader.add_xpath(
            'content',
            '//article[@id]/div[@class="gdlr-standard-style"]/div[@class="blog-content-wrapper"]/div[@class="gdlr-blog-content"]/p/text()',
            Join())
        content_loader.add_xpath(
            'publish_time',
            '//article[@id]/div[@class="gdlr-standard-style"]/div[@class="blog-content-wrapper"]/header[@class="post-header"]/div[@class="gdlr-blog-info gdlr-info"]/div[@class="blog-info blog-date"]/a[@href]//@href',
            deal_publish_time)
        content_loader.add_value(
            'img_urls',
            response.xpath('//div[@class="blog-content-wrapper"]//img').re(
                r'src="(.*?)"'))
        content_loader.add_value('id',
                                 response.url.strip('/').split('/')[-1],
                                 deal_id)
        item = content_loader.load_item()
        print(item)
        return item
示例#26
0
    def parse_content(self, response):
        """Parse one article page and return the populated item."""
        print('in parseMore')

        def deal_publish_time(raw_values):
            """Turn ['2017-03-29T11:52:42+00:00'] into '2017-03-29 11:52:42'."""
            if not raw_values:
                return None
            try:
                # Example input: 2017-03-29T11:52:42+00:00
                pieces = raw_values[0].split('T')
                clock = pieces[1].split('+')[0]
                return pieces[0] + ' ' + clock
            except Exception as e:
                print(e)
                return None

        loader = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader.add_value('url', response.url)
        loader.add_value('spider_time', time.time())
        loader.add_xpath('title', '//h1[@class="entry-title"]/text()',
                         lambda parts: parts[0].strip())
        loader.add_xpath('content',
                         '//div[@class="td-post-content"]//p/text()',
                         lambda paragraphs: [p.strip() for p in paragraphs],
                         Join())
        slug = response.url.strip('/').split('/')[-1].split('.')[0].strip('/')
        loader.add_value('id', slug)
        image_sources = response.xpath(
            '//div[@class="td-post-content"]//img/@src').extract()
        loader.add_value('img_urls', image_sources)
        loader.add_xpath(
            'publish_time',
            '//div[@class="td-module-meta-info"]/span/time/@datetime',
            deal_publish_time)
        loader.add_xpath('video_urls', '//iframe[@gesture="media"]/@src')

        item = loader.load_item()
        print(item)
        return item
示例#27
0
    def parse_content(self, response):
        """Parse an article whose date shows as 'DD.MM.YYYY' in the sidebar."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            """Assemble 'YYYY-MM-DD 00:00:00' from the regex groups
            [dd, mm, yyyy]; fixed fallback when missing or incomplete.
            Default is None rather than a shared mutable [].
            """
            if not publish_time_list:
                return '2018-02-01 00:00:00'
            try:
                # The page shows 'DD.MM.YYYY', so the groups arrive as
                # [dd, mm, yyyy]; reorder to ISO year-month-day — the original
                # built dd-mm-yyyy, disagreeing with every sibling spider's
                # 'YYYY-MM-DD' output and with this function's own fallback.
                return (publish_time_list[2] + '-' + publish_time_list[1] +
                        '-' + publish_time_list[0] + ' 00:00:00')
            except IndexError:
                return '2018-02-01 00:00:00'

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath('title', '//div[@id="bodyContent"]//h1/text()',
                          lambda x: ''.join([y for y in x]))
        loader1.add_xpath(
            'content',
            '//div[@id="bodyContent"]//div[@class="group"]//div/p//text()',
            lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.strip('/').split('/')[-1])
        loader1.add_xpath(
            'img_urls',
            '//div[@id="bodyContent"]//div[@class="group"]//div/p//img/@src')
        loader1.add_value(
            'publish_time',
            response.xpath(
                '//div[@id="bodyContent"]//div[@class="col3"]//div[@class="group"]/ul/li[1]//text()'
            ).re(r'(\d{2})\.(\d{2})\.(\d{4})'), deal_publish_time)
        loader1.add_value(
            'publish_user',
            response.xpath(
                '//div[@id="bodyContent"]//div[@class="col3"]//div[@class="group"]/ul/li//strong[contains(text(),"作者")]/../text()'
            ).extract(), lambda x: ''.join([str(y).strip() for y in x]))

        item = loader1.load_item()
        print(item)
        return item
示例#28
0
    def parse_content(self, response):
        """Parse an article page into a YfspiderspeakItem."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            """Normalize ['YYYY-MM-DDThh:mm:ss+...'] to 'YYYY-MM-DD hh:mm:ss'.

            Default is None rather than a shared mutable []; returns a fixed
            fallback date when the value is missing or has no '+' offset.
            """
            if publish_time_list:
                publish_time_str = publish_time_list[0]
            else:
                return '2018-02-01 00:00:00'
            if '+' in publish_time_str:
                publish_time_str_split = publish_time_str.split('+')[0]
                return publish_time_str_split.replace('T', ' ')
            else:
                return '2018-02-01 00:00:00'

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath(
            'title',
            '//div[@class="main"]//div[@class="article-main"]//h1//text()',
            lambda x: ''.join([y for y in x]))
        loader1.add_xpath(
            'content',
            '//div[@class="main"]//section[@class="article-content"]//text()',
            lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.strip('/').split('/')[-1])
        loader1.add_xpath(
            'img_urls',
            '//div[@class="main"]//section[@class="article-content"]//img/@src'
        )
        loader1.add_xpath(
            'publish_time',
            '//div[@class="main"]//dd[@class="published hasTooltip"]//time/@datetime',
            deal_publish_time)
        loader1.add_xpath(
            'publish_user',
            '//div[@class="main"]//dd[@itemprop="author"]//span[@itemprop="name"]/text()'
        )

        item = loader1.load_item()
        print(item)
        return item
示例#29
0
    def parse_content(self, response):
        """Parse a thetibetpost.com article into a YfspiderspeakItem."""
        print('in parseMore')

        def deal_publish_time(publish_time_list=None):
            """Normalize ['YYYY-MM-DDThh:mm:ss+...'] to 'YYYY-MM-DD hh:mm:ss'.

            Default is None rather than a shared mutable []; returns a fixed
            fallback date when the value is missing or has no '+' offset.
            """
            if publish_time_list:
                publish_time_str = publish_time_list[0]
            else:
                return '2018-02-01 00:00:00'
            if '+' in publish_time_str:
                publish_time_str_split = publish_time_str.split('+')[0]
                return publish_time_str_split.replace('T', ' ')
            else:
                return '2018-02-01 00:00:00'

        def deal_publish_user(publish_user_raw):
            """Join the extracted author fragments and split on commas.

            NOTE(review): the [:-1] unconditionally drops the last
            comma-separated token — presumably the empty piece after a
            trailing comma; verify against the live markup.
            """
            if publish_user_raw:
                publish_user_str = ''.join(publish_user_raw).strip()
                try:
                    return publish_user_str.split(',')[:-1]
                except Exception:
                    return publish_user_raw
            else:
                return publish_user_raw

        loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
        loader1.add_value('url', response.url)
        loader1.add_value('spider_time', time.time())
        loader1.add_xpath(
            'title',
            '//div[@class="main"]//article//header//h1//text()',
            lambda x: ''.join([y for y in x]).strip())
        loader1.add_xpath(
            'content',
            '//div[@class="main"]//article//div[@class="article-content-main"]//p//text()',
            lambda x: [i.strip() for i in x], Join())
        loader1.add_value('id', response.url.strip('/').split('/')[-1])
        loader1.add_xpath(
            'img_urls',
            '//div[@class="main"]//article//div[@class="article-content-main"]//section/p//img/@src',
            lambda x: ['http://www.thetibetpost.com' + y for y in x
                       if 'www.thetibetpost.com' not in y])
        loader1.add_xpath(
            'publish_time',
            '//div[@class="main"]//article//dd//time[@datetime]/@datetime',
            deal_publish_time)
        loader1.add_xpath(
            'publish_user',
            '//div[@class="main"]//article//dd[contains(@class,"createdby")]//span/text()',
            deal_publish_user)

        item = loader1.load_item()
        print(item)
        return item
示例#30
0
    def parse_content(self, response):
        """Parse an article page into a YfspiderspeakItem."""

        def deal_id(id_raw):
            """md5-hash the URL slug to build a stable item id.

            ItemLoader.add_value hands the raw string to its processors, so
            hash the whole slug — the original id_raw[0] hashed only its
            first character.  md5 requires bytes under Python 3, so encode.
            """
            id_str = id_raw[0] if isinstance(id_raw, list) else id_raw
            return hashlib.md5(id_str.encode('utf-8')).hexdigest()

        print(response.url)
        content_loader = ItemLoader(response=response,
                                    item=YfspiderspeakItem())
        content_loader.add_value('url', response.url)
        content_loader.add_value('spider_time', time.time())

        content_loader.add_xpath(
            'title',
            '//div[@class="breadcrumb-inner"]/div[@class="subtitle"]/h2/text()'
        )
        content_loader.add_xpath(
            'content',
            '//div[@class="container"]//div[@class="detail_text rich_editor_text"]//p/text()',
            Join())
        content_loader.add_xpath(
            'publish_time',
            '//div[@class="container"]//ul[@class="post-options"]//time/@datetime',
            # Guard the empty-extraction case instead of raising IndexError.
            lambda x: x[0] + ' 00:00:00' if x else None)
        content_loader.add_value('id',
                                 response.url.strip('/').split('/')[-1],
                                 deal_id)
        content_loader.add_xpath(
            'publish_user',
            '//div[@class="container"]//ul[@class="post-options"]//li/i[@class="icon icon-user"]/ancestor::li/a[@href]/text()'
        )
        content_loader.add_xpath(
            'publish_user_id',
            '//div[@class="container"]//ul[@class="post-options"]//li/i[@class="icon icon-user"]/'
            'ancestor::li/a[@href]/@href',
            # Same guard: pages without the author link yield an empty list.
            lambda x: x[0].strip('/').split('/')[-1] if x else None)
        content_loader.add_value(
            'img_urls',
            response.xpath(
                '//div[@id="main"]/div[@class="container"]/div[@class="row"]//img/@src'
            ).re(r'http://.*'))
        item1 = content_loader.load_item()
        return item1