def parse_content(self, response):
    """Parse one article page into a YfspiderspeakItem.

    :param response: Scrapy response for an article URL.
    :return: the loaded item (flows on to the pipelines).
    """
    def deal_id(id_raw):
        # hashlib.md5 requires bytes under Python 3; the raw id is the last
        # str segment of the URL, so encode it before hashing.
        return hashlib.md5(id_raw.encode('utf-8')).hexdigest()

    print(response.url)
    content_loader = ItemLoader(response=response, item=YfspiderspeakItem())
    content_loader.add_value('url', response.url)
    content_loader.add_value('spider_time', time.time())
    content_loader.add_xpath(
        'title',
        '//div[@id="content"]/div[@id="content-main"]//div[@class="entry clearfix"]//h1[@class="post-title entry-title"]//text()',
        lambda x: x[0].strip() if x else None)
    content_loader.add_xpath(
        'content',
        '//div[@id="content"]/div[@id="content-main"]//div[@class="entry clearfix"]/div[@class="entry-content clearfix"]/p//text()',
        Join())
    content_loader.add_xpath(
        'publish_time',
        '//div[@id]/div[@class="entry clearfix"]/div[@class="date updated alpha with-year"]/span/@title',
        lambda x: x[0].replace('T', ' ') + ':00' if x else None)
    content_loader.add_value('id',
                             response.url.strip('/').split('/')[-1],
                             deal_id)
    content_loader.add_xpath('img_urls',
                             '//div[@id="content-main"]//img/@src')
    content_loader.add_xpath(
        'video_urls',
        '//div[@id="content"]//div[@class="hentry-container clear"]//iframe/@src')
    item1 = content_loader.load_item()
    return item1
def parse_content(self, response):
    """Parse a VOA-style article page into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_list=None):
        # None sentinel replaces the mutable [] default (shared-state pitfall).
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        publish_time_str = publish_time_list[0]
        # Expecting ISO-8601 with timezone, e.g. 2018-05-02T10:00:00+08:00.
        if '+' in publish_time_str:
            return publish_time_str.split('+')[0].replace('T', ' ')
        return '2018-02-01 00:00:00'

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath('title',
                      '//div[@id="content"]//h1[@class="pg-title"]/text()',
                      lambda x: ''.join([y for y in x]))
    loader1.add_xpath(
        'content',
        '//div[@id="content"]//div[@id="article-content"]/div[@class="wsw"]//text()',
        lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id', response.url.strip('/').split('/')[-1])
    loader1.add_xpath(
        'img_urls',
        '//div[@id="content"]//div[@id="article-content"]/div[@class="wsw"]//img/@src')
    loader1.add_xpath(
        'publish_time',
        '//div[@id="content"]//div[@class="published"]//time/@datetime',
        deal_publish_time)
    # Comment feed example: https://www.voachinese.com/comments/a4392613p0.html
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse a storypage-layout article into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_list=None):
        # None sentinel replaces the mutable [] default argument.
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        publish_time_str = ''.join(publish_time_list).strip()
        try:
            return publish_time_str + ' 00:00:00'
        except Exception:
            # Narrowed from a bare except: keep the fixed fallback date.
            return '2018-02-01 00:00:00'

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath(
        'title',
        '//div[@id="abovefold"]//div[@id="storypagemaincol"]//h1/text()',
        lambda x: ''.join([y for y in x]))
    loader1.add_xpath(
        'content',
        '//div[@id="abovefold"]//div[@id="storypagemaincol"]//div[@id="storytext"]//text()',
        lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id', response.url.strip('/').split('/')[-1])
    loader1.add_xpath(
        'img_urls',
        '//div[@id="abovefold"]//div[@id="storypagemaincol"]//div[@id="storytext"]//img/@src',
        lambda x: [y for y in x if 'icon-' not in y])
    loader1.add_xpath(
        'publish_time',
        '//div[@id="abovefold"]//div[@id="storypagemaincol"]//div[@id="storytop"]//span[@id="story_date"]//text()',
        deal_publish_time)
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Extract one dalailamaworld topic page into a YfspiderspeakItem."""
    print('in parseMore')
    loader = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('spider_time', time.time())
    loader.add_xpath(
        'title',
        '//div[@class="page"]//div[@class="subject_bg1 nav"]//text()',
        lambda parts: ''.join(parts))
    loader.add_xpath(
        'content',
        '//div[@class="page"]//div[@class="topic_body"]//text()',
        lambda parts: [p.strip() for p in parts], Join())
    # The topic id lives in the "t=" query parameter.
    topic_id = response.url.split('t=')[-1].split('&')[0]
    loader.add_value('id', topic_id)
    loader.add_xpath(
        'img_urls',
        '//div[@class="page"]//div[@class="topic_body"]//img/@src')
    # The site exposes no publish date; use the fixed placeholder values.
    loader.add_value('publish_time', '2018-02-01 00:00:00')
    loader.add_value('publish_user', 'dalailamaworld')
    loader.add_xpath(
        'video_urls',
        '//div[@class="page"]//div[@class="topic_body"]//iframe/@src')
    item = loader.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse a savetibet.org article page; skip non-content pages."""
    print(response.url)

    def deal_img_urls(img_urls_raw):
        """Resolve relative image srcs; keep absolute ones; drop PDF buttons.

        Fix: the original appended outside the 'relative' branch, so an
        absolute URL re-appended a stale value (or raised NameError when it
        was the first entry). Absolute URLs are now passed through unchanged.
        """
        img_urls_dealed = []
        for one_url in img_urls_raw:
            if 'download-pdf' in one_url:
                continue
            if 'http' not in one_url:
                one_url = urljoin('http://www.savetibet.org/', one_url)
            img_urls_dealed.append(one_url)
        return img_urls_dealed

    # The unified article layout always carries an h1.title node.
    if not response.xpath('//div[@id="content"]//div[@id="main"]//h1[@class="title"]'):
        print('no,it not content page')
        return

    content_loader = ItemLoader(response=response, item=YfspiderspeakItem())
    content_loader.add_value('url', response.url)
    content_loader.add_value('spider_time', time.time())
    content_loader.add_xpath(
        'title',
        '//div[@id="content"]//div[@id="main"]//h1[@class="title"]//text()',
        lambda x: x[0].strip())
    content_loader.add_xpath(
        'content',
        '//div[@id="content"]//div[@id="main"]//div[@class="entry"]//text()',
        Join())
    content_loader.add_xpath(
        'img_urls',
        '//div[@id="main"]//div[@class="entry"]//img/@src',
        deal_img_urls)
    content_loader.add_xpath('video_urls',
                             '//div[@class="entry"]//iframe/@src')
    content_loader.add_value('id', response.url.strip('/').split('/')[-1])
    if response.xpath('//div[@class="post-meta"]'):
        content_loader.add_value(
            'publish_time',
            response.xpath(
                '//div[@class="post-meta"]//abbr[@class="date time published"]/@title'
            ).re('\d{4}\-\d{2}-\d{2}T\d{2}\:\d{2}:\d{2}'),
            lambda x: x[0].replace('T', ' ') if x else None)
        content_loader.add_xpath(
            'publish_user',
            '//div[@class="post-meta"]/span[@class="author vcard"]/span[@class="fn"]/a/text()')
    else:
        # No metadata block on the page: use the sentinel timestamp.
        content_loader.add_value('publish_time', '1111-11-11 11:11:11')
    return content_loader.load_item()
def parse_content(self, response):
    """Parse an article page into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_list=None):
        # None sentinel replaces the mutable [] default argument.
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        publish_time_str = publish_time_list[0]
        # Expecting ISO-8601 with timezone suffix, e.g. ...T10:00:00+08:00.
        if '+' in publish_time_str:
            return publish_time_str.split('+')[0].replace('T', ' ')
        return '2018-02-01 00:00:00'

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath('title',
                      '//main//div[@class="bd"]//h1[contains(@class,"title")]//text()',
                      lambda x: ''.join([y for y in x]))
    loader1.add_xpath('content', '//main//div[@id="artbody"]/p//text()',
                      lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id',
                      response.url.strip('/').split('/')[-1].split('.')[0])
    loader1.add_xpath('img_urls', '//main//div[@id="artbody"]/p//img/@src')
    loader1.add_xpath('publish_time', '//main//div[@id="artbody"]//time/@datetime',
                      deal_publish_time)
    item = loader1.load_item()
    print(item)
    return item
def parse_photo(self, response):
    """Turn a tibetswiss.ch photo page into a YfspiderspeakItem."""
    def deal_img_urls(raw_srcs):
        # Resolve every image src against the site root (absolute srcs pass
        # through urljoin unchanged).
        return [urljoin('http://www.tibetswiss.ch/', src) for src in raw_srcs]

    print(response.url)
    loader = ItemLoader(response=response, item=YfspiderspeakItem())
    loader.add_value('url', response.url)
    loader.add_value('spider_time', time.time())
    loader.add_xpath(
        'title',
        '//div[@id="main"]//div[@class="inside"]//div[@class="title"]/h1/text()')
    loader.add_xpath('img_urls',
                     '//div[@id="main"]//div[@class="inside"]//img/@src',
                     deal_img_urls)
    # Photo pages carry no date; use the sentinel timestamp.
    loader.add_value('publish_time', '1111-11-11 11:11:11')
    loader.add_value('id',
                     response.url.strip('/').split('/')[-1].split('.html')[0])
    return loader.load_item()
def parse_content(self, response):
    """Parse a dia-lead layout article page into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_list=None):
        # None sentinel replaces the mutable [] default argument.
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        return publish_time_list[0].strip()

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath('title',
                      '//div[@class="main"]//div[@class="dia-lead"]//h1/text()',
                      lambda x: ''.join([y for y in x]))
    loader1.add_xpath(
        'content',
        '//div[@class="main"]//div[@class="dia-lead-one"]/div[@id]//text()',
        lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id',
                      response.url.strip('/').split('/')[-1].split('.')[0])
    loader1.add_xpath(
        'img_urls',
        '//div[@class="main"]//div[@class="dia-lead-one"]/div[@id]//img/@src ')
    loader1.add_xpath(
        'publish_time',
        '//div[@class="main"]//div[@class="dia-lead"]//div[@class="sign1"]/div[@class="r"]/text()',
        deal_publish_time)
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Build a YfspiderspeakItem from one article page."""
    print('in parseMore')

    def deal_publish_time(date_parts=[]):
        # date_parts is the regex capture [YYYY, MM, DD]; fall back on any
        # failure (missing groups etc.).
        try:
            return '{0}-{1}-{2} 00:00:00'.format(date_parts[0],
                                                 date_parts[1],
                                                 date_parts[2])
        except Exception:
            return '2018-02-01 00:00:00'

    loader = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('spider_time', time.time())
    loader.add_xpath(
        'title',
        '//div[@id="container"]//div[@id="main"]//h1[@class="entry_title"]//text()',
        lambda parts: ''.join(parts).strip())
    loader.add_xpath('content',
                     '//div[@id="container"]//div[@id="main"]//p/text()',
                     lambda parts: [p.strip() for p in parts], Join())
    loader.add_value('id',
                     response.url.strip('/').split('/')[-1].split('.')[0])
    loader.add_xpath('img_urls',
                     '//div[@id="container"]//div[@id="main"]//img/@src')
    loader.add_xpath(
        'video_urls',
        '//div[@id="container"]//div[@id="main"]//p//iframe[@allow]/@src')
    loader.add_value(
        'publish_time',
        response.xpath(
            '//div[@id="container"]//div[@id="main"]//div[@class="singlepostmeta"]//text()'
        ).re('(\d{4})\/(\d{2})\/(\d{2})'),
        deal_publish_time)
    loader.add_xpath(
        'publish_user',
        '//div[@id="container"]//div[@id="main"]//a[@rel="author"]//text()',
        lambda parts: ''.join(parts))
    item = loader.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse a Tibetan-language article page into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_list=None):
        """Translate Tibetan digits, then extract M-D-Y from the date text."""
        # None sentinel replaces the mutable [] default argument.
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        publish_time_str = publish_time_list[0]
        try:
            tibetan_digits = {u"༠": u"0", u"༡": u"1", u"༢": u"2", u"༣": u"3",
                              u"༤": u"4", u"༥": u"5", u"༦": u"6", u"༧": u"7",
                              u"༨": u"8", u"༩": u"9"}
            for key, val in tibetan_digits.items():
                # BUG FIX: the membership test must run against the string,
                # not the list, or the substitution never fires.
                if key in publish_time_str:
                    publish_time_str = publish_time_str.replace(key, val)
            re_find_time = re.compile(r'(\d{1,2}).*?(\d{1,2}).*?(\d{4})')
            # findall yields tuples, e.g. [(u'5', u'19', u'2018')].
            month, day, year = re_find_time.findall(publish_time_str)[0]
            return year + '-' + month.zfill(2) + '-' + day.zfill(2) + ' 00:00:00'
        except Exception:
            # Narrowed from a bare except: any parse failure → fixed default.
            return '2018-02-01 00:00:00'

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath('title', '//main//div[@class="page-header"]//h1//text()',
                      lambda x: ''.join([y for y in x]))
    loader1.add_xpath('content', '//main//div[@class="entry-content"]//p//text()',
                      lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id', response.url.strip('/').split('/')[-1])
    loader1.add_xpath('img_urls',
                      '//main//div[@class="entry-content"]//p//img/@src')
    loader1.add_xpath(
        'publish_time',
        '//header//div[@class="single_meta_item single_meta_date"]//text()',
        deal_publish_time)
    loader1.add_xpath('publish_user', '//header//div[@id="single_byline"]//text()[2]')
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse a kagyuoffice.org.tw news page into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_list):
        # The value is the 'news/...' URL tail whose prefix is YYYYMMDD.
        if not publish_time_list:
            return '2018-05-09 00:00:00'
        digits = publish_time_list.replace('-', '')
        return digits[0:4] + '-' + digits[4:6] + '-' + digits[6:8] + ' 00:00:00'

    def deal_read_count(read_count=None):
        # The hit counter may be missing or non-numeric; default to 0.
        # Fix: int('') used to raise ValueError when the regex matched
        # only empty strings.
        digits = ''.join(read_count) if read_count else ''
        return int(digits) if digits.isdigit() else 0

    def deal_img_urls(img_urls_raw):
        """Resolve site-relative image srcs; skip print/email toolbar icons.

        Fixes two defects: the 'printButton' membership test ran against the
        whole list instead of each URL, and the function never returned its
        accumulated list (callers always received None).
        """
        url_list = []
        for one_img_url in img_urls_raw:
            if 'printButton' in one_img_url or 'emailButton' in one_img_url:
                continue
            if ('https://www.kagyuoffice.org.tw' not in one_img_url
                    and one_img_url.startswith('/')):
                one_img_url = 'https://www.kagyuoffice.org.tw' + one_img_url
            url_list.append(one_img_url)
        return url_list

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath(
        'title',
        '//div[@id="gj-main-content"]//div[@class="span9"]//div[@class="item-page"]//h2//a//text()',
        lambda x: ''.join([y for y in x]))
    loader1.add_xpath(
        'content',
        '//div[@id="gj-main-content"]//div[@class="span9"]//div[@class="item-page"]/p//text()',
        lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id', response.url.split('news/')[-1])
    loader1.add_xpath(
        'img_urls',
        '//div[@id="gj-main-content"]//div[@class="span9"]//div[@class="item-page"]//img/@src',
        deal_img_urls)
    loader1.add_value('publish_time', response.url.split('news/')[-1],
                      deal_publish_time)
    loader1.add_value('publish_user', 'kagyuoffice')
    loader1.add_value(
        'read_count',
        response.xpath(
            '//div[@class="item-page"]//dd[@class="gj-hits"]//text()').re('.*?(\d*)'),
        deal_read_count)
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse an FTChinese story page into a YfspiderspeakItem."""
    print(response.url)

    def deal_img_urls(img_url_list):
        # Pass-through hook kept for parity with the sibling spiders.
        return img_url_list

    def deal_publish_time(publish_time_raw_list):
        """Build 'YYYY-MM-DD HH:MM:SS' from the story-time regex groups.

        Month/day/hour/minute come from \\d{1,2} groups, so each is
        zero-padded here (the original left hour/minute unpadded, producing
        malformed timestamps like '2018-05-02 9:5:00').
        """
        try:
            year = str(publish_time_raw_list[0])
            month = str(publish_time_raw_list[1]).zfill(2)
            day = str(publish_time_raw_list[2]).zfill(2)
            hour = str(publish_time_raw_list[3]).zfill(2)
            minute = str(publish_time_raw_list[4]).zfill(2)
            return year + '-' + month + '-' + day + ' ' + hour + ':' + minute + ':00'
        except Exception as e:
            # Best-effort: log and leave the field unset on malformed input.
            print(e)

    def deal_reply_nodes(response_url):
        # Comments require a separate request; emit the comment-feed URL so a
        # later stage can build the real reply_nodes.
        reply_id = response_url.split('/')[-1].split('?')[0]
        return 'http://www.ftchinese.com/index.php/c/newcomment/' + reply_id + '?v=1'

    def deal_publish_user(publisher_list):
        return [one_user.strip() for one_user in publisher_list]

    # No story timestamp on the page means this is not a content page.
    if not response.xpath('//span[@class="story-time"]/text()').re(
            '(\d{4}).(\d{1,2}).(\d{1,2}). (\d{1,2})\:(\d{1,2})'):
        return

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath('title', '//h1[@class="story-headline"]/text()',
                      TakeFirst())
    loader1.add_value('id', response.url.split('/')[-1].split('?')[0])
    loader1.add_value(
        'img_urls',
        response.xpath(
            '//div[@class="story-container"]//img/@src|//div[@class="story-container"]//figure/@data-url'
        ).extract(),
        deal_img_urls)
    loader1.add_xpath('content', '//div[@class="story-body"]//p//text()', Join())
    loader1.add_value(
        'publish_time',
        response.xpath('//span[@class="story-time"]/text()').re(
            '(\d{4}).(\d{1,2}).(\d{1,2}). (\d{1,2})\:(\d{1,2})'),
        deal_publish_time)
    loader1.add_xpath('publish_user', '//span[@class="story-author"]/a/text()',
                      deal_publish_user)
    loader1.add_value(
        'reply_count',
        response.xpath('//div[@id="allcomments"]/div[@class="commentcontainer"]'),
        lambda x: len(x))
    return loader1.load_item()
def parse_content(self, response):
    """Parse an article page into a YfspiderspeakItem."""

    def deal_publish_time(publish_time_raw):
        # Normalize 'YYYY-MM-DD HH:MM' (16 chars) to a full HH:MM:SS stamp.
        # [-1] replaces .pop(): same element, no mutation of the input list.
        publish_time_str = publish_time_raw[-1].strip()
        if len(publish_time_str) == 16:
            return publish_time_str + ':00'
        return publish_time_str

    def deal_content(content_list_raw):
        return ''.join(part.strip() for part in content_list_raw)

    def deal_img_urls(img_urls_raw):
        """Make protocol-relative srcs absolute; keep absolute ones.

        Fix: the original appended outside the 'relative' branch, so
        already-absolute URLs re-appended a stale value (or raised
        NameError). They are now passed through unchanged.
        """
        img_url_list = []
        for one_img_url in img_urls_raw:
            if 'http' not in one_img_url:
                one_img_url = 'http:' + one_img_url
            img_url_list.append(one_img_url)
        return img_url_list

    def deal_publish_user(publish_user_raw):
        # The author line may be absent; return None to leave the field unset.
        if not publish_user_raw:
            return None
        return publish_user_raw[0].strip().split(' ')

    print('in parseMore')
    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_value('id', response.url.split('/')[-1].split('.')[0])
    loader1.add_xpath('title', '//div[@class="col-left"]/div/center/h1/text()',
                      lambda x: x[0].strip())
    loader1.add_value(
        'publish_time',
        response.xpath(
            '//div[@class="col-left"]/div/div[@class="fontsize"]/div/h2'
        ).re('(\d{4}\-\d{2}\-\d{2} \d{2}\:\d{2})'),
        deal_publish_time)
    loader1.add_xpath(
        'content',
        '//div[@class="article"]/div[@class="article_right"]/p/text()',
        deal_content)
    loader1.add_value(
        'img_urls',
        response.xpath('//div[@class="article"]//img/@src').extract(),
        deal_img_urls)
    loader1.add_value(
        'publish_user',
        response.xpath('//div[@class="col-left"]//div[@class="fontsize"]'
                       ).re(r'作者:(.*)\n?'),
        deal_publish_user)
    return loader1.load_item()
def parse_content(self, response):
    """Extract one article page into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_str):
        # The date is encoded in the URL path as .../YYYY/MMDD/...
        if not publish_time_str:
            return '2018-02-01 00:00:00'
        publish_time_str = publish_time_str.strip()
        try:
            segments = publish_time_str.split('/')
            year = segments[-3]
            month_day = segments[-2]
            return (year + '-' + month_day[0:2] + '-' + month_day[2:4] +
                    ' 00:00:00')
        except Exception:
            return '2018-02-01 00:00:00'

    def deal_img_urls(img_urls_raw):
        # Drop placeholder gifs; make protocol-relative srcs absolute.
        cleaned = []
        for url in img_urls_raw:
            if 'empty.gif' in url:
                continue
            cleaned.append(url if 'http' in url else 'http:' + url)
        return cleaned

    loader = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('spider_time', time.time())
    loader.add_xpath('title',
                     '//div[@id="main"]//div[@id="Article"]/h1//text()',
                     lambda parts: ''.join(parts))
    loader.add_xpath(
        'content',
        '//div[@id="main"]//div[@id="Article"]//article[@id="content"]//text()',
        lambda parts: [p.strip() for p in parts], Join())
    loader.add_value('id', response.url.strip('/').split('/')[-1])
    loader.add_xpath(
        'img_urls',
        '//div[@id="main"]//div[@id="Article"]//article[@id="content"]//img/@src|//div[@id="main"]//div[@id="Article"]//article[@id="content"]//img/@data-src',
        deal_img_urls)
    loader.add_value('publish_time', response.url, deal_publish_time)
    loader.add_xpath(
        'publish_user',
        '//div[@id="main"]//div[@id="Article"]//p[@id="editor"]//b/text()')
    loader.add_xpath(
        'video_urls',
        '//div[@id="main"]//div[@id="Article"]//article[@id="content"]//iframe/@src')
    item = loader.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse a tie-theme post page into a YfspiderspeakItem."""
    print('has get one_website', response.url)

    def deal_publish_time(publish_time_raw_list):
        # Regex groups arrive as (MonthName, day, year), e.g. ('May','2','2018').
        month_str = str(publish_time_raw_list[0])
        day_str = str(publish_time_raw_list[1])
        year_str = str(publish_time_raw_list[2])
        month_transform = {
            'January': '01', 'February': '02', 'March': '03', 'April': '04',
            'May': '05', 'June': '06', 'July': '07', 'August': '08',
            'September': '09', 'October': '10', 'November': '11',
            'December': '12'
        }
        return year_str + '-' + month_transform[month_str] + '-' + day_str + ' 00:00:00'

    def deal_id(raw_id):
        # hashlib.md5 requires bytes under Python 3; the original hashed the
        # str directly and raised TypeError. Also renamed the parameter so it
        # no longer shadows the builtin id().
        return hashlib.md5(raw_id.encode('utf-8')).hexdigest()

    loaders1 = ItemLoader(response=response, item=YfspiderspeakItem())
    loaders1.add_value('url', response.url)
    loaders1.add_value('spider_time', time.time())
    loaders1.add_xpath(
        'title',
        '//*[@id="the-post"]/div[@class="post-inner"]/h1/span/text()')
    loaders1.add_value(
        'publish_time',
        response.xpath(
            '//*[@id="the-post"]//span[@class="tie-date"]/text()').re(
                r'(\S*) (\d{1,2})\, (\d{1,4})'),
        deal_publish_time)
    loaders1.add_xpath(
        'content',
        '//div[@class="content"]//div[@class="entry"]//text()', Join())
    loaders1.add_value(
        'img_urls',
        response.xpath('//*[@id="the-post"]/div/div[@class="entry"]//img').re(
            r'src="(.*?)"'))
    loaders1.add_value('id', response.url.split('chinese/')[1].strip('\/'),
                       deal_id)
    return loaders1.load_item()
def parse_content(self, response):
    """Parse an article page into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_list=None):
        """Turn ['MonthName', day, year] fragments into a timestamp string."""
        # None sentinel replaces the mutable [] default; also removed the
        # misleading alias that bound the whole list to 'publish_time_str'.
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        try:
            month_name = str(publish_time_list[0]).strip()
            day = str(publish_time_list[1]).strip()
            year = str(publish_time_list[2])
            month_transform = {
                'January': '01', 'February': '02', 'March': '03',
                'April': '04', 'May': '05', 'June': '06', 'July': '07',
                'August': '08', 'September': '09', 'October': '10',
                'November': '11', 'December': '12'
            }
            month_num = month_transform[month_name]
            if len(day) < 2:
                day = '0' + day
            return year + '-' + month_num + '-' + day + ' 00:00:00'
        except Exception:
            # Narrowed from a bare except: any parse failure → fixed default.
            return '2018-02-01 00:00:00'

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath('title', '//div[@class="hideOnNavigation"]//h1//text()',
                      lambda x: ''.join([y for y in x]))
    loader1.add_xpath(
        'content',
        '//div[contains(@class,"newsContentArea")]//p//text()',
        lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id', response.url.strip('/').split('/')[-1])
    loader1.add_xpath('img_urls',
                      '//div[contains(@class,"newsContentArea")]//img/@src')
    loader1.add_xpath('publish_time',
                      '//div[@class="hideOnNavigation"]//h1/span//text()',
                      deal_publish_time)
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse a fusion-theme article page into a YfspiderspeakItem."""

    def deal_publish_time(publish_time_raw):
        # publish_time_raw[0] looks like 'May 1st, 2018'.
        if not publish_time_raw:
            return '1111-11-11 11:11:11'
        parts = publish_time_raw[0].split(' ')
        month_str = parts[0]
        # Strip the trailing comma and any ordinal suffix (st/nd/rd/th),
        # then zero-pad — the original only handled 'th,' and left
        # single-digit days unpadded.
        day_str = parts[1].rstrip(',').rstrip('stndrh').zfill(2)
        year_str = parts[2]
        month_transform = {
            'January': '01', 'February': '02', 'March': '03', 'April': '04',
            'May': '05', 'June': '06', 'July': '07', 'August': '08',
            'September': '09', 'October': '10', 'November': '11',
            'December': '12'
        }
        return year_str + '-' + month_transform[month_str] + '-' + day_str + ' 00:00:00'

    print(response.url)
    content_loader = ItemLoader(response=response, item=YfspiderspeakItem())
    content_loader.add_value('url', response.url)
    content_loader.add_value('spider_time', time.time())
    content_loader.add_value('id', response.url.strip('/').split('/')[-1])
    content_loader.add_xpath(
        'title', '//div[@id="content"]//h2[@class="entry-title"]//text()')
    content_loader.add_xpath(
        'content', '//div[@id="content"]//div[@class="post-content"]//text()',
        Join())
    content_loader.add_value(
        'publish_time',
        response.xpath(
            '//div[@id="content"]//div[@class="fusion-meta-info"]//text()'
        # Regex fix: the original r'\S* \d{1,2}th, \d{1,4}' only matched
        # days ending in 'th', silently dropping 1st/2nd/3rd/22nd dates.
        ).re(r'\S* \d{1,2}(?:st|nd|rd|th), \d{1,4}'),
        deal_publish_time)
    content_loader.add_xpath(
        'img_urls', '//div[@id="main"]//div[@id="content"]//img/@src')
    return content_loader.load_item()
def parse_content(self, response):
    """Parse a chushigangdrug.ch table-layout page into a YfspiderspeakItem."""

    def deal_img_urls(img_urls_raw):
        # Map each relative src onto the section-specific base URL; the
        # section is inferred from the page URL itself.
        resolved = []
        for raw in img_urls_raw:
            if 'mages/up' in raw or 'arrow_big_up' in raw:
                continue  # skip upload icons / navigation arrows
            if 'verein' in response.url:
                full = 'http://www.chushigangdrug.ch/verein/' + raw.lstrip('.')
            elif 'tanzgruppe' in response.url:
                full = 'http://www.chushigangdrug.ch/tanzgruppe/' + raw.strip('.')
            elif 'galerie' in response.url:
                full = 'http://www.chushigangdrug.ch/galerie/' + raw.strip('.')
            else:
                full = 'http://www.chushigangdrug.ch/' + raw.strip('.')
            resolved.append(full)
        return resolved

    print(response.url)
    # Only pages with the titel table are real content pages.
    if not response.xpath('//table//table[@class="titel"]//tr/td'):
        print('unknown page')
        return

    loader = ItemLoader(response=response, item=YfspiderspeakItem())
    loader.add_value('url', response.url)
    loader.add_value('spider_time', time.time())
    loader.add_value('id',
                     response.url.strip('/').split('/')[-1].replace('.', '_'))
    loader.add_xpath('title', '//table//table[@class="titel"]//tr/td/text()',
                     lambda parts: parts[0].strip())
    loader.add_xpath(
        'content',
        '//td[@valign="top"]//td[@valign and @bgcolor="#FFFFFF"]//text()',
        Join())
    # The site exposes no dates; use the fixed placeholder.
    loader.add_value('publish_time', '2018-02-01 00:00:00')
    loader.add_xpath(
        'img_urls',
        '//td[@valign="top"]//td[@valign and @bgcolor="#FFFFFF"]//img[@width>50]/@src',
        deal_img_urls)
    return loader.load_item()
def parse_content(self, response):
    """Rules-designated callback: every response whose URL matches the crawl
    rules lands here.

    :param response: a standard Scrapy response object; supports
        response.xpath(), response.css() and their .re() variants.
    :return: a dict/item flows on through the pipelines; a Request would go
        back through the scheduler to the downloader.
    """
    print('in parseMore')

    def deal_publish_time(publish_time_list=[]):
        """Field processor for publish_time (xpath results arrive as lists).

        Returns the normalized timestamp, or a fixed default when the page
        carries no usable ISO-8601 date.
        """
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        raw = publish_time_list[0]
        if '+' in raw:
            return raw.split('+')[0].replace('T', ' ')
        return '2018-02-01 00:00:00'

    loader = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('spider_time', time.time())
    loader.add_xpath('title', '//article//h2[@class="entry-title"]/text()',
                     lambda parts: ''.join(parts))
    loader.add_xpath('content',
                     '//article//div[@class="entry-content"]/p//text()',
                     lambda parts: [p.strip() for p in parts], Join())
    loader.add_value('id', response.url.strip('/').split('/')[-1])
    loader.add_xpath('img_urls',
                     '//article//div[@class="entry-content"]//img/@src')
    loader.add_xpath('publish_time',
                     '//article//time[@class="published"]/@datetime',
                     deal_publish_time)
    loader.add_xpath('publish_user',
                     '//article//time[@class="published"]/a[@class="fn"]/text()')
    item = loader.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse an article page; the publish date is embedded in the URL."""
    print('in parseMore')

    def deal_publish_time(publish_time_str):
        # Extract articles/YYYY/M/D from the URL and zero-pad each part.
        if not publish_time_str:
            print('time is None')
            return None
        re_find_publish_date = re.compile(
            'articles\/(\d{4})\/(\d{1,2})\/(\d{1,2})')
        publish_date_list = re_find_publish_date.findall(publish_time_str)
        # Guard added: URLs without the date pattern used to raise
        # IndexError on publish_date_list[0].
        if not publish_date_list:
            return None
        # findall yields a list of tuples; zfill pads month/day, leaves
        # the 4-digit year untouched.
        date_str_list = [str(part).zfill(2) for part in publish_date_list[0]]
        return (date_str_list[0] + '-' + date_str_list[1] + '-' +
                date_str_list[2] + ' 00:00:00')

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath(
        'title',
        '//div[@id="master_container"]//div[contains(@class,"articleTitle")]/h1/text()',
        TakeFirst(), lambda x: x.strip())
    loader1.add_xpath(
        'content',
        '//div[@id="master_container"]//div[contains(@class,"articleContent")]//text()',
        lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id',
                      response.url.split('-')[-1].split('.')[0].strip())
    loader1.add_xpath(
        'img_urls',
        '//div[@id="master_container"]//div[contains(@id,"articleContent")]//img/@src')
    loader1.add_value('publish_time', response.url, deal_publish_time)
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse an article page; the date comes from embedded ld+json."""
    print('in parseMore')

    def deal_publish_time(publish_time_list=None):
        # None sentinel replaces the mutable [] default argument.
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        publish_time_str = publish_time_list[0]
        # .re() can hand back a tuple when the pattern has several groups;
        # isinstance replaces the non-idiomatic `type(x) is type(())` check.
        if isinstance(publish_time_str, tuple):
            publish_time_str = publish_time_str[0]
        if publish_time_str:
            return publish_time_str.replace('T', ' ')
        return '2018-02-01 00:00:00'

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath('title',
                      '//div[@class="page-container"]//header/h1/text()',
                      lambda x: ''.join([y for y in x]))
    loader1.add_xpath(
        'content',
        '//div[@class="page-container"]//div[@itemprop="articleBody"]//text()',
        lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id', response.url.strip('/').split('/')[-1])
    loader1.add_xpath(
        'img_urls',
        '//div[@class="page-container"]//div[@itemprop="articleBody"]//img/@src')
    loader1.add_value(
        'publish_time',
        response.xpath(
            '//head/script[@type="application/ld+json"]//text()').re(
                '\"dateCreated\"\:\"(\d{4}\-\d{2}\-\d{2}T\d{2}\:\d{2}\:\d{2})Z"'),
        deal_publish_time)
    loader1.add_xpath(
        'publish_user',
        '//article//time[@class="published"]/a[@class="fn"]/text()')
    loader1.add_xpath(
        'video_urls',
        '//div[@class="page-container"]//div[@itemprop="articleBody"]//iframe/@src')
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse a kirti92.org article; the timestamp lives in the URL slug."""
    print('in parseMore')

    def deal_publish_time(publish_url_raw):
        # Slug shape: <prefix>-YYYY-MM-DD-HH-MM-SS...; rejoin the numeric
        # fields into a 'YYYY-MM-DD HH:MM:SS' timestamp.
        fields = publish_url_raw.split('/')[-1].split('-')
        return (fields[1] + '-' + fields[2] + '-' + fields[3] + ' ' +
                fields[4] + ':' + fields[5] + ':' + fields[6])

    def deal_img_urls(img_urls_raw):
        # Prefix host-relative srcs with the site root; keep absolute ones.
        if not img_urls_raw:
            return []
        return [u if 'www.kirti92.org' in u else 'http://www.kirti92.org' + u
                for u in img_urls_raw]

    loader = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('spider_time', time.time())
    loader.add_xpath('title', '//div[@id="wrapper"]//h2/a/text()',
                     TakeFirst(), lambda x: x.strip())
    loader.add_xpath(
        'content',
        '//div[@id="centercontent_bg"]//div[@class="item-page"]/p//text()',
        lambda parts: [p.strip() for p in parts], Join())
    loader.add_value(
        'id', response.url.strip('/').split('/')[-1].split('.')[0].strip())
    loader.add_xpath(
        'img_urls',
        '//div[@id="centercontent_bg"]//div[@class="item-page"]//img/@src',
        deal_img_urls)
    loader.add_value('publish_time', response.url, deal_publish_time)
    item = loader.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse an ntd.tv article page into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_list=''):
        """Derive 'YYYY-MM-DD 00:00:00' from an ntd.tv URL path.

        Returns a fixed sentinel date when the URL does not carry a date.
        """
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        # The split/index below raised outside any try before; keep the
        # whole fragile chain inside it and catch only what it can raise.
        try:
            publish_time_str = publish_time_list.split('ntd.tv/')[1]
            time_splited = publish_time_str.split('/')
            year = time_splited[0]
            month = time_splited[1]
            days = time_splited[2]
            return '%s-%s-%s 00:00:00' % (year, month, days)
        except IndexError:
            return '2018-02-01 00:00:00'

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath('title', '//main[@id="main"]//h1//text()',
                      lambda x: ''.join(x).strip())
    loader1.add_xpath(
        'content',
        '//main[@id="main"]//div[@class="left_block"]/div[contains(@class,"content")]//p//text()',
        lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id', response.url.strip('/').split('/')[-1])
    loader1.add_xpath(
        'img_urls',
        '//main[@id="main"]//div[@class="left_block"]/div[contains(@class,"content")]//img/@src'
    )
    # The URL path encodes the publish date (YYYY/MM/DD).
    loader1.add_value('publish_time', response.url, deal_publish_time)
    loader1.add_xpath(
        'publish_user',
        '//main[@id="main"]//div[@class="author"]//span[@class="author_name"]/text()'
    )
    loader1.add_xpath(
        'video_urls',
        '//main[@id="main"]//div[@class="left_block"]/div[contains(@class,"content") or contains(@class,"container")]//video/@src'
    )
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse an article page (entry-title theme) into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_list=None):
        # "2018-01-02T03:04:05+00:00" -> "2018-01-02 03:04:05"; sentinel
        # date when nothing was extracted or no timezone offset is present.
        # (was a mutable default `[]`; None avoids the shared-default pitfall)
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        publish_time_str = publish_time_list[0]
        if '+' not in publish_time_str:
            return '2018-02-01 00:00:00'
        return publish_time_str.split('+')[0].replace('T', ' ')

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath(
        'title',
        '//div[@class="article-content clearfix"]//h1[@class="entry-title"]/text()',
        lambda x: ''.join([y.strip() for y in x]))
    loader1.add_xpath(
        'content',
        '//div[@class="article-content clearfix"]//div[@class="entry-content clearfix"]/p//text()',
        lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id', response.url.strip('/').split('/')[-1])
    loader1.add_xpath(
        'img_urls',
        '//div[@class="article-content clearfix"]//div[@class="entry-content clearfix"]/p//img/@src'
    )
    loader1.add_xpath(
        'publish_time',
        '//div[@class="article-content clearfix"]//div[@class="below-entry-meta"]//time[@class="entry-date published"]/@datetime',
        deal_publish_time)
    loader1.add_xpath(
        'publish_user',
        '//div[@class="article-content clearfix"]//div[@class="below-entry-meta"]/span[@class="byline"]//a[@class="url fn n"]/@title'
    )
    loader1.add_xpath(
        'read_count',
        '//div[@class="article-content clearfix"]//div[@class="below-entry-meta"]/span[@class="post-views"]//span[@class="total-views"]//text()'
    )
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse a gdlr-theme blog post into a YfspiderspeakItem."""

    def deal_publish_time(publish_time_url):
        # The date link's href ends in ".../YYYY/MM/DD/"; turn that into
        # "YYYY-MM-DD 00:00:00". None when nothing was extracted.
        print('in deal_publish_time', publish_time_url)
        try:
            publish_time_url = publish_time_url[0]
        except (IndexError, TypeError):
            return None
        publish_time_date_str = publish_time_url.split('org/')[1].strip(
            '/').replace('/', '-')
        return publish_time_date_str + ' 00:00:00'

    def deal_id(raw_id):
        # hashlib.md5 requires bytes in Python 3 — the old code passed the
        # str straight in and raised TypeError. (Also: don't shadow `id`.)
        return hashlib.md5(raw_id.encode('utf-8')).hexdigest()

    print(response.url)
    content_loader = ItemLoader(item=YfspiderspeakItem(), response=response)
    content_loader.add_value('url', response.url)
    content_loader.add_value('spider_time', time.time())
    content_loader.add_xpath(
        'title',
        '//article[@id]/div[@class="gdlr-standard-style"]/div[@class="blog-content-wrapper"]/header/h1/text()'
    )
    content_loader.add_xpath(
        'content',
        '//article[@id]/div[@class="gdlr-standard-style"]/div[@class="blog-content-wrapper"]/div[@class="gdlr-blog-content"]/p/text()',
        Join())
    content_loader.add_xpath(
        'publish_time',
        '//article[@id]/div[@class="gdlr-standard-style"]/div[@class="blog-content-wrapper"]/header[@class="post-header"]/div[@class="gdlr-blog-info gdlr-info"]/div[@class="blog-info blog-date"]/a[@href]//@href',
        deal_publish_time)
    content_loader.add_value(
        'img_urls',
        response.xpath('//div[@class="blog-content-wrapper"]//img').re(
            'src="(.*?)"'))
    # Item id = md5 of the last URL path segment.
    content_loader.add_value('id', response.url.strip('/').split('/')[-1],
                             deal_id)
    item = content_loader.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Extract a YfspiderspeakItem from a td-theme article page."""
    print('in parseMore')

    def deal_publish_time(publish_time):
        # "2017-03-29T11:52:42+00:00" -> "2017-03-29 11:52:42";
        # None when the value is missing or malformed.
        if not publish_time:
            return None
        try:
            pieces = publish_time[0].split('T')
            return pieces[0] + ' ' + pieces[1].split('+')[0]
        except Exception as err:
            print(err)
            return None

    loader = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader.add_value('url', response.url)
    loader.add_value('spider_time', time.time())
    loader.add_xpath('title', '//h1[@class="entry-title"]/text()',
                     lambda vals: vals[0].strip())
    loader.add_xpath('content', '//div[@class="td-post-content"]//p/text()',
                     lambda paras: [p.strip() for p in paras], Join())
    # Item id = last path segment, file extension stripped.
    loader.add_value(
        'id',
        response.url.strip('/').split('/')[-1].split('.')[0].strip('/'))
    loader.add_value(
        'img_urls',
        response.xpath('//div[@class="td-post-content"]//img/@src').extract())
    loader.add_xpath(
        'publish_time',
        '//div[@class="td-module-meta-info"]/span/time/@datetime',
        deal_publish_time)
    loader.add_xpath('video_urls', '//iframe[@gesture="media"]/@src')
    item = loader.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse a bodyContent-layout article into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_list=None):
        # Input is the flat regex-group list captured from "DD.MM.YYYY".
        # NOTE(review): output keeps the page's DD-MM-YYYY field order while
        # other parsers in this file emit YYYY-MM-DD — confirm intent.
        # (was a mutable default `[]`; None avoids the shared-default pitfall)
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        try:
            return '%s-%s-%s 00:00:00' % (publish_time_list[0],
                                          publish_time_list[1],
                                          publish_time_list[2])
        except IndexError:
            return '2018-02-01 00:00:00'

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath('title', '//div[@id="bodyContent"]//h1/text()',
                      lambda x: ''.join(x))
    loader1.add_xpath(
        'content',
        '//div[@id="bodyContent"]//div[@class="group"]//div/p//text()',
        lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id', response.url.strip('/').split('/')[-1])
    loader1.add_xpath(
        'img_urls',
        '//div[@id="bodyContent"]//div[@class="group"]//div/p//img/@src')
    # Date appears as "DD.MM.YYYY" in the first sidebar list item.
    loader1.add_value(
        'publish_time',
        response.xpath(
            '//div[@id="bodyContent"]//div[@class="col3"]//div[@class="group"]/ul/li[1]//text()'
        ).re(r'(\d{2})\.(\d{2})\.(\d{4})'), deal_publish_time)
    loader1.add_value(
        'publish_user',
        response.xpath(
            '//div[@id="bodyContent"]//div[@class="col3"]//div[@class="group"]/ul/li//strong[contains(text(),"作者")]/../text()'
        ).extract(), lambda x: ''.join([str(y).strip() for y in x]))
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse an article-main layout page into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_list=None):
        # "2018-01-02T03:04:05+00:00" -> "2018-01-02 03:04:05"; sentinel
        # date when nothing was extracted or no timezone offset is present.
        # (was a mutable default `[]`; None avoids the shared-default pitfall)
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        publish_time_str = publish_time_list[0]
        if '+' not in publish_time_str:
            return '2018-02-01 00:00:00'
        return publish_time_str.split('+')[0].replace('T', ' ')

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath(
        'title',
        '//div[@class="main"]//div[@class="article-main"]//h1//text()',
        lambda x: ''.join(x))
    loader1.add_xpath(
        'content',
        '//div[@class="main"]//section[@class="article-content"]//text()',
        lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id', response.url.strip('/').split('/')[-1])
    loader1.add_xpath(
        'img_urls',
        '//div[@class="main"]//section[@class="article-content"]//img/@src'
    )
    loader1.add_xpath(
        'publish_time',
        '//div[@class="main"]//dd[@class="published hasTooltip"]//time/@datetime',
        deal_publish_time)
    loader1.add_xpath(
        'publish_user',
        '//div[@class="main"]//dd[@itemprop="author"]//span[@itemprop="name"]/text()'
    )
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse a thetibetpost.com article into a YfspiderspeakItem."""
    print('in parseMore')

    def deal_publish_time(publish_time_list=None):
        # "2018-01-02T03:04:05+00:00" -> "2018-01-02 03:04:05"; sentinel
        # date when nothing was extracted or no timezone offset is present.
        # (was a mutable default `[]`; None avoids the shared-default pitfall)
        if not publish_time_list:
            return '2018-02-01 00:00:00'
        publish_time_str = publish_time_list[0]
        if '+' not in publish_time_str:
            return '2018-02-01 00:00:00'
        return publish_time_str.split('+')[0].replace('T', ' ')

    def deal_publish_user(publish_user_raw):
        # Author text usually ends with a trailing comma; dropping the last
        # split() element strips it.
        # NOTE(review): a value containing no comma yields [] here (the whole
        # name is discarded) — confirm that is intended.
        if not publish_user_raw:
            return publish_user_raw
        publish_user_str = ''.join(publish_user_raw).strip()
        try:
            return publish_user_str.split(',')[:-1]
        except Exception:  # best-effort: fall back to the raw value
            return publish_user_raw

    loader1 = ItemLoader(item=YfspiderspeakItem(), response=response)
    loader1.add_value('url', response.url)
    loader1.add_value('spider_time', time.time())
    loader1.add_xpath(
        'title', '//div[@class="main"]//article//header//h1//text()',
        lambda x: ''.join(x).strip())
    loader1.add_xpath(
        'content',
        '//div[@class="main"]//article//div[@class="article-content-main"]//p//text()',
        lambda x: [i.strip() for i in x], Join())
    loader1.add_value('id', response.url.strip('/').split('/')[-1])
    # Absolutize relative image URLs against the site host.
    loader1.add_xpath(
        'img_urls',
        '//div[@class="main"]//article//div[@class="article-content-main"]//section/p//img/@src',
        lambda x: ['http://www.thetibetpost.com' + y for y in x
                   if 'www.thetibetpost.com' not in y])
    loader1.add_xpath(
        'publish_time',
        '//div[@class="main"]//article//dd//time[@datetime]/@datetime',
        deal_publish_time)
    loader1.add_xpath(
        'publish_user',
        '//div[@class="main"]//article//dd[contains(@class,"createdby")]//span/text()',
        deal_publish_user)
    item = loader1.load_item()
    print(item)
    return item
def parse_content(self, response):
    """Parse a breadcrumb/post-options layout page into a YfspiderspeakItem."""

    def deal_id(id_raw):
        # hashlib.md5 requires bytes in Python 3 — the old code passed a
        # str and raised TypeError.
        # NOTE(review): only id_raw[0] is hashed; if the processor receives
        # the plain str that is just its first character — confirm the
        # loader hands a list here.
        id_str = id_raw[0]
        return hashlib.md5(id_str.encode('utf-8')).hexdigest()

    print(response.url)
    content_loader = ItemLoader(response=response, item=YfspiderspeakItem())
    content_loader.add_value('url', response.url)
    content_loader.add_value('spider_time', time.time())
    content_loader.add_xpath(
        'title',
        '//div[@class="breadcrumb-inner"]/div[@class="subtitle"]/h2/text()'
    )
    content_loader.add_xpath(
        'content',
        '//div[@class="container"]//div[@class="detail_text rich_editor_text"]//p/text()',
        Join())
    content_loader.add_xpath(
        'publish_time',
        '//div[@class="container"]//ul[@class="post-options"]//time/@datetime',
        lambda x: x[0] + ' 00:00:00')
    content_loader.add_value('id', response.url.strip('/').split('/')[-1],
                             deal_id)
    content_loader.add_xpath(
        'publish_user',
        '//div[@class="container"]//ul[@class="post-options"]//li/i[@class="icon icon-user"]/ancestor::li/a[@href]/text()'
    )
    content_loader.add_xpath(
        'publish_user_id',
        '//div[@class="container"]//ul[@class="post-options"]//li/i[@class="icon icon-user"]/'
        'ancestor::li/a[@href]/@href',
        lambda x: x[0].strip('/').split('/')[-1])
    content_loader.add_value(
        'img_urls',
        response.xpath(
            '//div[@id="main"]/div[@class="container"]/div[@class="row"]//img/@src'
        ).re(r'http://.*'))
    item1 = content_loader.load_item()
    return item1