def parse_essay(self, response):
    socket.setdefaulttimeout(30)
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    for card in soup.select('.widget-listing'):  # static article list
        response.meta['title'] = card.select_one('a').get('title')
        mm = card.select_one('h5').text  # e.g. '09 जनवरी 2021'
        # ss is e.g. 'Jan 09 2021'
        ss = self.hindi_month[mm.split()[1]] + ' ' + mm.split()[0] + ' ' + mm.split()[2]
        response.meta['pub_time'] = Util.format_time2(ss)
        if self.time is None or Util.format_time3(Util.format_time2(ss)) >= int(self.time):
            yield Request(url=card.select_one('a').get('href'), meta=response.meta, callback=self.parse_item)
        else:
            self.logger.info('Time cutoff reached!')
    try:  # try dynamic loading (some second-level sections have none)
        page = 1
        self.params['path'] = response.url.replace('https://www.aajtak.in', '')  # was response.replace(...), a bug
        while flag:  # was 'while True: if flag:', which never terminated once flag went False
            self.params['id'] = str(page)
            page += 1
            api_rq = requests.get(self.api_url, params=self.params, headers=self.headers)
            if api_rq.status_code != 200:
                break
            soup = BeautifulSoup(api_rq.text, 'html.parser')
            for card in soup.select('.widget-listing'):  # dynamically loaded second-level listing
                # the inner loop variable was also named i, clobbering the page counter
                response.meta['title'] = card.select_one('a').get('title')
                mm = card.select_one('h5').text  # e.g. '09 जनवरी 2021'
                ss = self.hindi_month[mm.split()[1]] + ' ' + mm.split()[0] + ' ' + mm.split()[2]  # e.g. 'Jan 09 2021'
                response.meta['pub_time'] = Util.format_time2(ss)
                if self.time is None or Util.format_time3(Util.format_time2(ss)) >= int(self.time):
                    yield Request(url=card.select_one('a').get('href'), meta=response.meta, callback=self.parse_item)
                else:
                    flag = False
                    self.logger.info('Time cutoff reached!')
    except Exception:
        self.logger.info('No more dynamic news loading!')
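# The method above assumes a self.hindi_month lookup table on the spider. A
# minimal, hypothetical sketch of that attribute (the real table is defined
# elsewhere in the spider and its exact keys may differ):
hindi_month = {
    'जनवरी': 'Jan', 'फरवरी': 'Feb', 'मार्च': 'Mar', 'अप्रैल': 'Apr',
    'मई': 'May', 'जून': 'Jun', 'जुलाई': 'Jul', 'अगस्त': 'Aug',
    'सितंबर': 'Sep', 'अक्टूबर': 'Oct', 'नवंबर': 'Nov', 'दिसंबर': 'Dec',
}
# e.g. hindi_month['जनवरी'] + ' 09 2021' -> 'Jan 09 2021', the form format_time2 expects here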
def parse_essay(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    for i in soup.find_all(class_='amp-wp-content amp-loop-list'):
        tt = i.select_one('.featured_time').text.split()  # e.g. ['2', 'दिन', 'ago']
        try:
            pub_time = tt[0] + ' ' + self.hindi_time_ago[tt[1]] + ' ' + tt[2]  # e.g. '2 days ago'
        except Exception:
            pub_time = Util.format_time(0)
        if self.time is None or Util.format_time3(Util.format_time2(pub_time)) >= int(self.time):  # not past cutoff
            response.meta['title'] = i.select_one('h2').text
            response.meta['abstract'] = i.select_one('.large-screen-excerpt-design-3').text
            response.meta['pub_time'] = Util.format_time2(pub_time)
            response.meta['images'] = [i.select_one('amp-img').get('src')]
            yield Request(url=i.select_one('a').get('href'), meta=response.meta, callback=self.parse_item)
        else:
            flag = False
            self.logger.info('Time cutoff reached')
            break
    if flag:
        nextPage = soup.select_one('#pagination a')
        # guard added: the original default 'Next Page No More' would have been yielded as an invalid URL
        if nextPage is not None:
            yield Request(nextPage.get('href'), meta=response.meta, callback=self.parse_essay)
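# Likewise, self.hindi_time_ago must map Hindi time-unit words to English so
# that '2 दिन ago' becomes '2 days ago'. A hypothetical sketch (the real
# table lives elsewhere and may carry more units):
hindi_time_ago = {
    'दिन': 'days',      # day(s)
    'घंटे': 'hours',     # hour(s)
    'मिनट': 'minutes',  # minute(s)
    'सप्ताह': 'weeks',   # week(s)
}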
def parse_page(self, response):
    meta = {}
    soup = bs(response.text, "html.parser")
    crumbs = soup.select(".breadcrumb > li")
    category1 = crumbs[1].text if len(crumbs) > 1 else None
    if category1 in ('SPORTS', 'OPINION', 'TECH TREND'):
        category2 = None
    else:
        # the original indexed crumbs[2] before checking it existed
        category2 = crumbs[2].text.strip() if len(crumbs) > 2 else None
    meta["category1"] = category1
    meta["category2"] = category2
    self.logger.info(category1)
    self.logger.info(category2)
    for i in soup.find_all(class_="entry-title"):
        news_url = i.find("a").get("href")
        yield scrapy.Request(news_url, callback=self.parse_news, meta=meta)
    pub_time = soup.find_all(class_="meta-date")[-1].text.strip()
    self.logger.info(Util.format_time2(pub_time))
    if self.time is None or Util.format_time3(Util.format_time2(pub_time)) >= int(self.time):
        url = soup.find(class_="next page-numbers").get("href") if soup.find(class_="next page-numbers") else None
        self.logger.info(url)
        if url:
            yield scrapy.Request(url, callback=self.parse_page)
    else:
        self.logger.info('Time cutoff reached')
def parse(self, response):  # news list page that already carries the full articles
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    time_sel = ('#page-content-wrapper > div.container.section-1 > div > '
                'div.col-lg-9.col-md-9.col-sm-9.col-xs-12 > div > div h5')
    body_sel = ('#page-content-wrapper > div.container.section-1 > div > '
                'div.col-lg-9.col-md-9.col-sm-9.col-xs-12 > div p')
    last_pub_time = Util.format_time2(soup.select(time_sel)[-1].text)
    if self.time is None or Util.format_time3(last_pub_time) >= int(self.time):
        all_pub_time = [Util.format_time2(i.text) for i in soup.select(time_sel)]
        all_title = [i.text.strip() for i in soup.select('.lk-tle')]
        all_images = ['https://www.myanmarisis.org' + i.get('src') for i in soup.select('.img-responsive.lk-img')]
        all_body = [i.text.strip() for i in soup.select(body_sel)]
        # iterate over however many entries the page actually has (the original hard-coded range(9))
        for i in range(min(len(all_pub_time), len(all_title), len(all_images), len(all_body))):
            item = DemoItem()
            item['pub_time'] = all_pub_time[i]
            item['images'] = [all_images[i]]
            item['title'] = all_title[i]
            item['body'] = all_body[i]
            item['category1'] = 'event'
            item['category2'] = None
            item['abstract'] = all_body[i].split('\n')[0]
            yield item
    else:
        self.logger.info('Time cutoff reached!')
        flag = False
    if flag:
        try:
            nextPage = soup.select_one('.active ~ li a').get('href')
            yield Request(url=nextPage)
        except Exception:
            self.logger.info("Next page no more.")
def parse2(self, response):
    html = BeautifulSoup(response.text, 'html.parser')  # parser made explicit; the original relied on the default
    item = DemoItem()
    item['category1'] = response.meta['category1']
    if response.meta['category1'] != 'life':
        item['title'] = html.select('#landing-headline > h1')[0].text
        item['body'] = ''
        flag = False
        for i in html.select('#article-content > p'):
            item['body'] += i.text
            if i.text != '' and not flag:
                flag = True
                item['abstract'] = i.text  # abstract is the first non-empty paragraph
        item['pub_time'] = Util.format_time2(html.select('#m-pd2 > span')[-1].text)
        item['images'] = []
        for i in html.select('#article-content img'):
            item['images'].append(i.attrs['src'])
        yield item
    else:
        item['title'] = html.select('#art-hgroup > h1')[0].text
        item['body'] = ''
        flag = False
        for i in html.select('#article-content > p'):
            item['body'] += (i.text + '\n')
            if i.text != '' and not flag:
                flag = True
                item['abstract'] = i.text
        item['pub_time'] = Util.format_time2(html.select('.art-byline > span')[-1].text)
        item['images'] = []
        for i in html.select('#article-content img'):
            item['images'].append(i.attrs['src'])
        yield item
def parse2(self, response):
    item = DemoItem()
    html = BeautifulSoup(response.text, 'html.parser')
    parts = response.url.split('/')  # renamed from `list`, which shadowed the builtin
    item['title'] = html.select('.title')[0].text
    item['category1'] = parts[3]
    if re.findall(r'\d+', parts[4]) == []:
        item['category2'] = parts[4]
    item['body'] = ''
    flag = False
    # NOTE: this id looks article-specific ('#content-body-244757-498257') and may not generalize
    for i in html.select('#content-body-244757-498257 > p'):
        item['body'] += (i.text + '\n')
        if i.text != '' and not flag:
            flag = True
            item['abstract'] = i.text
    if html.select('.dateLine > p') != []:
        item['pub_time'] = Util.format_time2(html.select('.dateLine > p')[0].text)
    elif html.select('.dateString') != []:
        item['pub_time'] = Util.format_time2(html.select('.dateString')[0].text)
    if html.select('.margin-bottom-15 img') != []:
        item['images'] = ['https://www.cnnphilippines.com' + html.select('.margin-bottom-15 img')[0].attrs['src']]
    yield item
def parse(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    for i in soup.select('.post'):
        url = i.select_one('a').get('href')
        response.meta['title'] = i.select_one('h2 > a').text
        response.meta['abstract'] = i.select_one('div.exceprt').text  # 'exceprt' is the site's own class name
        response.meta['pub_time'] = Util.format_time2(i.select_one('.date').text)
        if self.time is None or Util.format_time3(Util.format_time2(i.select_one('.date').text)) >= int(self.time):
            yield Request(url, meta=response.meta, callback=self.parse_item)
        else:
            flag = False
            self.logger.info('Time cutoff reached')
            break
    if flag:
        try:
            nextPage = soup.find(class_='next page-numbers').get('href')  # the original x-if-x-else-None was redundant
            if nextPage:
                yield Request(url=nextPage, meta=response.meta, callback=self.parse)
        except Exception:
            self.logger.info('Next page no more!')
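# The same cutoff test recurs in nearly every parse method above and below. A
# hypothetical helper (not part of the original code) could express it once,
# reusing the project's own Util functions:
def within_cutoff(pub_text, cutoff):
    # True when no cutoff is set, or when pub_text parses to a time at or
    # after the epoch-seconds cutoff.
    return cutoff is None or Util.format_time3(Util.format_time2(pub_text)) >= int(cutoff)
# usage: if within_cutoff(i.select_one('.date').text, self.time): yield Request(...)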
def parse_eassys(self, response):  # pagination and URL crawling for the second-level sections
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    if re.match(r'.*photo-gallery.*', response.url):  # photo galleries
        for t in soup.find_all(class_='col-sm-4 col-md-4 photo-photo-h'):
            try:
                url = 'https://zeenews.india.com' + t.select_one('a').get('href')
            except Exception:
                continue
            response.meta['title'] = t.select_one('h3').text
            response.meta['images'] = [t.select_one('img').get('src')]
            response.meta['pub_time'] = t.select_one('.photo-date').text.strip()
            if self.time is None or Util.format_time3(Util.format_time2(t.select_one('.photo-date').text.strip())) >= int(self.time):
                yield Request(url, callback=self.parse_item_photo, meta=response.meta)
            else:
                flag = False
                self.logger.info('Time cutoff reached')
                break
    elif re.match(r'.*video.*', response.url):  # videos
        for i in soup.find_all(attrs={'class': 'mini-video mini-video-h margin-bt30px'}):  # articles on the section's initial page
            url = 'https://zeenews.india.com' + i.select_one('a').get('href')
            response.meta['images'] = [i.select_one('img').get('src')]
            response.meta['title'] = i.select_one('h3').text
            pub_time = i.select_one('span.date').text.strip()  # the original mixed '.date' and 'span.date'
            response.meta['pub_time'] = pub_time
            if self.time is None or Util.format_time3(Util.format_time2(pub_time)) >= int(self.time):
                yield Request(url, callback=self.parse_item_video, meta=response.meta)
            else:
                flag = False
                self.logger.info('Time cutoff reached')
                break
    else:
        for t in soup.find_all(class_='section-article margin-bt30px clearfix'):  # articles on the section's initial page
            url = 'https://zeenews.india.com' + t.select_one('a').get('href')
            response.meta['title'] = t.select_one('h3.margin-bt10px').text
            tt = t.select_one('span.date').text.strip().split()
            try:
                pub_time = self.hindi_month[tt[0]] + ' ' + tt[1] + ' ' + tt[2] + ' ' + tt[3] + ' ' + tt[5]
            except Exception:
                pub_time = t.select_one('span.date').text.strip()
            response.meta['pub_time'] = pub_time
            response.meta['images'] = [t.select_one('img').get('src')]
            if self.time is None or Util.format_time3(Util.format_time2(pub_time)) >= int(self.time):
                yield Request(url=url, meta=response.meta, callback=self.parse_item)
            else:
                flag = False
                self.logger.info('Time cutoff reached')
                break
    if flag:
        try:
            nextPage = 'https://zeenews.india.com/' + soup.find(class_='next last').select_one('a').get('href')
            yield Request(nextPage, callback=self.parse_eassys, meta=response.meta)
        except Exception:
            self.logger.info('Next page no more!')
def parse_detail(self, response):
    item = DemoItem()
    html = BeautifulSoup(response.text, 'html.parser')
    item['category1'] = response.meta['category1']
    item['category2'] = response.meta['category2']
    if html.select_one("div.container h1") is not None:
        item['title'] = html.select_one("div.container h1").text
    item['body'] = ''
    if html.select("div.col-24 p"):
        bodies = html.select("div.col-24 p")
        item['body'] = '\n'.join(b.text for b in bodies)
        item['abstract'] = bodies[0].text
    item['images'] = []
    for i in html.select("div.col-24 figure img"):
        item['images'].append(i['src'])
    if html.select_one("p.byline span.date") is not None:
        pub_time = re.findall('Published on (.*)', html.select_one("p.byline span.date").text)
        item['pub_time'] = Util.format_time2(pub_time[0]) if pub_time else Util.format_time()
    else:
        item['pub_time'] = Util.format_time()
    yield item
def parse2(self, response):
    html = BeautifulSoup(response.text, 'html.parser')
    parts = response.url.split('/')  # renamed from `list`, which shadowed the builtin
    category1 = parts[4]
    if len(parts) > 5 and parts[5] != 'page':
        category2 = parts[5]
    else:
        category2 = ''
    for i in html.select('div[class="default-post-category-content post_box"] .default-category-image > a'):
        yield Request(i.attrs['href'],
                      meta={'category1': category1, 'category2': category2},
                      callback=self.parse3)
    dates = html.select('div[class="default-post-category-content post_box"] .post_date')
    if html.select('.previous_posts > a') != [] and (
            self.time is None or Util.format_time3(Util.format_time2(dates[-1].text)) >= int(self.time)):
        yield Request(html.select('.previous_posts > a')[0].attrs['href'], callback=self.parse2)
    else:
        self.logger.info('Time cutoff reached')
def get_next_page(self, response):
    soup = bs(response.text, "html.parser")
    item = response.meta["item"]
    div_list = soup.find_all("div", class_="data-bg-hover data-bg data-bg-categorised")
    for div in div_list:
        article_url = div.select_one("a").get("href")
        # data is passed between layers through the meta parameter
        yield scrapy.Request(article_url, callback=self.get_news_detail, meta={"item": item})
    last_date = soup.find_all("article")[-1].find("span", class_="item-metadata posts-date").text.strip()
    if self.time is None or Util.format_time3(Util.format_time2(last_date)) >= int(self.time):
        url = soup.find("a", class_="next page-numbers").get("href") if soup.find("a", class_="next page-numbers") else None
        if url:
            yield scrapy.Request(url, meta=response.meta, callback=self.get_next_page)
    else:
        self.logger.info('Time cutoff reached')
def parse_news(self, response):
    item = DemoItem()
    soup = bs(response.text, "html.parser")
    item["category1"] = response.meta["category1"]
    item["category2"] = response.meta["category2"]
    item["title"] = soup.find(class_="post-title entry-title").text.strip()
    item["pub_time"] = Util.format_time2(soup.find(class_="published timeago").text.strip())
    content = soup.find(class_="post-body entry-content")
    item["images"] = [img.get("src") for img in content.find_all("img")]  # find_all already returns [] when empty
    body1 = ''
    for div in content.find_all(dir="ltr"):
        body1 += (div.text.strip() + '\n')
    if body1 == '':
        body1 = content.text
    # drop blank lines
    body = ''
    for b in body1.split("\n"):
        if b != '':
            body += (b + '\n')
    item["body"] = body
    item["abstract"] = body.split("\n")[0]
    yield item
def parse_item(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    category = response.url.split('/')[-3].split('_')
    if len(category) == 3:
        item['category1'] = category[1]
        item['category2'] = category[2]
    else:
        item['category1'] = category[0]
        item['category2'] = category[1]
    item['title'] = soup.select_one('h1.entry-title').text
    item['pub_time'] = Util.format_time2(soup.select('span.td-post-date > time')[0].text)
    item['images'] = [i.get('data-src') for i in soup.select('div.td-post-content img')]
    item['abstract'] = soup.select('div.td-post-content > p')[0].text
    ss = ''
    for i in soup.select('div.td-post-content > p'):
        ss += i.text + '\n'  # was r'\n', which appends a literal backslash-n rather than a newline
    item['body'] = ss
    return item
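# A quick check of the bug fixed above: the raw-string prefix keeps the
# backslash, so r'\n' is two characters and never acts as a line break.
assert r'\n' == '\\n' and len(r'\n') == 2
assert '\n' != r'\n' and len('\n') == 1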
def parse_news(self, response):
    item = DemoItem()
    soup = bs(response.text, "html.parser")
    item["category1"] = response.meta["category1"]
    item["category2"] = response.meta["category2"]
    date_span = soup.find("span", "entry-meta-date updated")
    if date_span:
        item["pub_time"] = Util.format_time2(date_span.find("a").text.strip())
    else:
        # the original fed this placeholder through format_time2, which cannot parse it
        item["pub_time"] = "0000-00-00 00:00:00"
    div = soup.find("div", class_="entry-content clearfix")
    item["images"] = [img.get("src") for img in div.find_all("img")] if div.find_all("img") else None
    item["title"] = soup.find("h1", class_="entry-title").text.strip()
    # the original fell back to a single string and then concatenated it character by character
    if div.find_all("li"):
        item["abstract"] = ''.join(a.text.strip() for a in div.find_all("li"))
    else:
        item["abstract"] = div.find("p").text.strip()
    body = [p.text.strip() for p in div.find_all("p")]
    item["body"] = "\n".join(body)  # join on [] is just ''; the original's None fallback would have crashed here
    yield item
def parse3(self, response):
    html = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    parts = response.url.split('/')  # renamed from `list`, which shadowed the builtin
    item['title'] = html.select('.news-title')[0].text
    item['category1'] = parts[3]
    if re.findall(r'\d+', parts[4]) == []:
        item['category2'] = parts[4]
    item['body'] = ''
    for i in html.select('.article-content > p'):
        item['body'] += (i.text + '\n')
    if html.select('.article-content > p') != []:
        item['abstract'] = html.select('.article-content > p')[0].text
    if html.select('.timestamp-entry > .date-posted') != []:
        # logging moved inside the guard; the original logged before checking and could crash
        self.logger.info(html.select('.timestamp-entry > .date-posted')[0].text)
        item['pub_time'] = Util.format_time2(html.select('.timestamp-entry > .date-posted')[0].text)
    else:
        item['pub_time'] = Util.format_time()
    if html.select('.article-content > .embed-wrap img') != []:
        item['images'] = [html.select('.article-content > .embed-wrap img')[0].attrs['src']]
    yield item
def parse_category2(self, response):
    soup = BeautifulSoup(response.text, 'lxml')
    url_list = soup.find_all('h2', class_='entry-title grid-title')
    for url in url_list:
        news_url = url.find('a').get('href')
        yield scrapy.Request(news_url, meta=response.meta, callback=self.parse_details)
    # cutoff time
    ddl = soup.find('time', class_='entry-date published').text.strip()  # e.g. 'January 10, 2021'
    ddl = Util.format_time2(ddl)  # '2021-01-10 00:00:00'
    ddl = Util.format_time3(ddl)  # 1610208000
    # pagination
    if soup.find('a', class_='next page-numbers'):
        next_url = soup.find('a', class_='next page-numbers').get('href')
        if self.time is None or ddl >= int(self.time):
            yield scrapy.Request(next_url, meta=response.meta, callback=self.parse_category2)
        else:
            self.logger.info('Time cutoff reached')
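# The inline comments above pin down the Util contract used throughout:
# format_time2 parses a human-readable date into 'YYYY-MM-DD HH:MM:SS' and
# format_time3 turns that into epoch seconds. A minimal sketch under those
# assumptions (the real Util is defined elsewhere and evidently accepts many
# more input formats, e.g. '2 days ago' and Hindi-derived dates):
import time
from dateutil import parser as date_parser  # assumed dependency

class UtilSketch:
    @staticmethod
    def format_time2(text):
        # 'January 10, 2021' -> '2021-01-10 00:00:00'
        return date_parser.parse(text).strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def format_time3(text):
        # '2021-01-10 00:00:00' -> 1610208000 on a UTC+8 machine (local-time epoch)
        return int(time.mktime(time.strptime(text, '%Y-%m-%d %H:%M:%S')))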
def parse(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    for i in soup.find_all(class_='td_module_10 td_module_wrap td-animation-stack'):
        url = i.select_one('h3 > a').get('href')
        response.meta['title'] = i.select_one('h3 > a').text
        response.meta['abstract'] = i.select_one('div.td-excerpt').text
        response.meta['pub_time'] = i.select_one('.td-post-date').text
        if self.time is None or Util.format_time3(Util.format_time2(i.select_one('.td-post-date').text)) >= int(self.time):
            yield Request(url, meta=response.meta, callback=self.parse_item)
        else:
            flag = False
            self.logger.info('Time cutoff reached')
    if flag:
        try:
            nextPage = soup.find(class_='page-nav td-pb-padding-side').select('a')[-1].get('href')  # the original x-if-x-else-None was redundant
            if nextPage:
                yield Request(url=nextPage, meta=response.meta, callback=self.parse)
        except Exception:
            self.logger.info('Next page no more!')
def parse_item(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    item = DemoItem()
    item['title'] = response.meta['title']
    item['category1'] = response.meta['category1']
    item['abstract'] = response.meta['abstract']
    item['images'] = response.meta['images']
    item['category2'] = response.meta['category2']
    if re.findall('headline', response.url):  # regular news
        ss = ''
        for i in soup.select('.dit > p > b'):
            ss += i.text + '\n'
        try:
            ss += soup.select_one('.dit > p > span').text
        except Exception:
            pass
        item['body'] = ss
        tt = soup.select_one('.colort').text.split()  # e.g. ['Wednesday', '6', 'January', '2021', '02:12:12', 'PM']
        tt = tt[2] + ' ' + tt[1] + ' ' + tt[3] + ' ' + tt[4] + ' ' + tt[5]  # e.g. 'January 6 2021 02:12:12 PM'
        item['pub_time'] = Util.format_time2(tt)
    elif re.findall('watchvid', response.url):  # video news
        item['body'] = soup.select_one('.dit > p').text
        item['pub_time'] = soup.select_one('.colort').text
    else:  # photo news
        item['body'] = soup.select_one('.news_saa > p').text
        item['pub_time'] = Util.format_time(0)
    return item
def parse_details(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, 'lxml')
    item['category1'] = response.meta['category1']
    item['category2'] = response.meta['category2']
    item['title'] = soup.find('h1', class_='post-title entry-title').text.strip() if soup.find('h1', class_='post-title entry-title') else None
    item['body'] = ''  # don't forget to initialize
    item['abstract'] = ''
    if soup.select('.entry-content p,.entry-content h3'):
        body_list = soup.select('.entry-content p,.entry-content h3')  # this selector grabs several different tags at once
        for body in body_list:
            item['body'] += body.text.strip()
            item['body'] += '\n'
        item['abstract'] = body_list[0].text.strip()
    item['images'] = []
    image_list = soup.select('.entry-content p>img,.single-featured-image>img')
    for image in image_list:  # select() returns [] when nothing matches, so no None guard is needed
        item['images'].append(image.get('src'))
    pub = soup.find('span', class_='date meta-item tie-icon').text.strip() if soup.find('span', class_='date meta-item tie-icon') else None
    if pub:
        item['pub_time'] = Util.format_time2(pub)
    yield item
def parse(self, response):
    meta = {}
    soup = bs(response.text, "html.parser")
    category1 = soup.find(class_="tdb-title-text").text.strip()
    meta["category1"] = category1
    for i in soup.find_all(class_="td-module-container td-category-pos-above"):
        news_url = i.find(class_="entry-title td-module-title").find("a").get("href")
        category2 = i.find(class_="td-module-meta-info").select_one("a").text.strip()
        if category2 == category1:
            category2 = None
        meta["category2"] = category2
        yield scrapy.Request(news_url, callback=self.parse_news, meta=meta)
    pub = soup.find_all(class_="entry-date updated td-module-date")[-1].text.strip()
    if self.time is None or Util.format_time3(Util.format_time2(pub)) >= int(self.time):
        url = soup.find("div", "page-nav td-pb-padding-side").select("a")[-1].get("href")
        yield Request(url, callback=self.parse)
    else:
        self.logger.info('Time cutoff reached')
def get_news_detail(self, response):
    '''
    :param response: response for the news article page
    :return: detailed news page information
    '''
    item = response.meta["item"]
    soup = bs(response.text, "html.parser")
    title = soup.find("h1", class_="entry-title").text
    pub_time = Util.format_time2(soup.find("time", class_="entry-date updated td-module-date").text)
    image_list = [
        img.get("src") for img in soup.find_all("img", class_="entry-thumb td-modal-image")
    ] if soup.find("img", class_="entry-thumb td-modal-image") else None
    body = ''
    for p in soup.find("div", class_="td-post-content").select("p"):
        body += (p.text + '\n')
    abstract = body.split("।")[0]  # first sentence, split on the Devanagari danda
    item["title"] = title
    item["pub_time"] = pub_time
    item["images"] = image_list
    item["abstract"] = abstract
    item["body"] = body
    yield item
def get_next_page(self, response):
    item = response.meta["item"]
    soup = bs(response.text, "html.parser")
    crumbs = soup.find("div", class_="entry-crumbs")
    # len(crumbs.select("i")) tells how many category levels the breadcrumb has
    if len(crumbs.select("i")) == 1:
        item['category1'] = crumbs.find("span", class_="td-bred-no-url-last").text
        item['category2'] = " "
    else:
        item['category1'] = crumbs.select("span")[-2].text
        item['category2'] = crumbs.find("span", class_="td-bred-no-url-last").text
    for h3 in soup.find("div", class_="td-big-grid-wrapper").select("h3"):
        article_url = h3.select_one("a").get("href")
        yield scrapy.Request(article_url, meta=response.meta, callback=self.get_news_detail)
    main_div = soup.select_one("#td-outer-wrap > div.td-main-content-wrap > div > div > div.td-pb-span8.td-main-content > div")
    for h3 in main_div.find_all("h3", class_="entry-title td-module-title"):
        article_url = h3.select_one("a").get("href")
        yield scrapy.Request(article_url, meta=response.meta, callback=self.get_news_detail)
    temp_time = soup.find_all("div", class_="td-block-span6")[-1].select("time")[-1].text if soup.find_all("div", class_="td-block-span6") else "January 1, 1970"
    if self.time is None or Util.format_time3(Util.format_time2(temp_time)) >= int(self.time):
        next_url = None
        if soup.find("div", class_="page-nav td-pb-padding-side"):  # skip pages without a pagination bar
            nav_links = soup.find("div", class_="page-nav td-pb-padding-side").select("a")
            # the last link only carries an <i> arrow icon when a further page exists
            next_url = nav_links[-1].get("href") if nav_links[-1].select("i") else None
        if next_url:
            yield scrapy.Request(next_url, meta=response.meta, callback=self.get_next_page)
    else:
        self.logger.info('Time cutoff reached')
def parse_news(self, response):
    item = DemoItem()
    soup = bs(response.text, "html.parser")
    item["category1"] = response.meta["category1"]
    item["category2"] = response.meta["category2"]
    item["title"] = soup.find(class_="tdb-title-text").text.strip()
    item["pub_time"] = Util.format_time2(soup.find(class_="entry-date updated td-module-date").text.strip())
    images = [
        soup.find("div", "tdb-block-inner td-fix-index").find("img").get("src")
    ] if soup.find("div", "tdb-block-inner td-fix-index") else []  # was None, which would crash on append below
    for img in soup.find_all("div", "wp-block-image"):
        images.append(img.find("img").get("src"))
    item["images"] = images
    item["abstract"] = soup.select_one("div.wpb_wrapper > div > div > p").text.strip() if soup.select_one("div.wpb_wrapper > div > div > p") else None
    body = soup.find(class_="tdb-caption-text").text.strip() + '\n' if soup.find(class_="tdb-caption-text") else ''
    for p in soup.select("div.wpb_wrapper > div > div > p"):
        body += (p.text.strip() + '\n')
    item["body"] = body
    self.logger.info(item)
    yield item
def parse_essay(self, response):
    soup = BeautifulSoup(response.text, 'html.parser')
    flag = True
    for i in soup.select('.CatNewsFirst_FirstNews'):
        tt = i.select_one('h1 ~ span').text.split('|')[0].strip()
        pub_time = Util.format_time2(tt.split()[1] + ' ' + tt.split()[0] + ' ' + tt.split()[2])
        url = 'http://www.univarta.com' + i.select_one('a').get('href')
        response.meta['title'] = i.select_one('a').text
        response.meta['pub_time'] = pub_time
        try:
            response.meta['images'] = [i.select_one('img').get('src')]
        except Exception:
            response.meta['images'] = []
        response.meta['abstract'] = i.select_one('h1 ~ p').text
        if self.time is None or Util.format_time3(pub_time) >= int(self.time):
            yield Request(url=url, meta=response.meta, callback=self.parse_item)
        else:
            flag = False
            self.logger.info('Time cutoff reached')
    if flag:
        try:
            nextPage = response.meta['cate_url'] + soup.select_one('.jp-current ~ a').get('href')
            self.logger.info(nextPage)
            yield Request(nextPage, meta=response.meta, callback=self.parse_essay)
        except Exception:
            self.logger.info('Next page no more!')
def get_news_detail(self, response):
    '''
    :param response: response for the news article page
    :return: detailed news page information
    '''
    item = response.meta["item"]
    soup = bs(response.text, "html.parser")
    title = soup.find("header", class_="td-post-title").select_one("h1").text
    pub_time = soup.find("header", class_="td-post-title").select_one("time").text
    image_list = []
    if soup.find("div", class_="td-post-featured-image"):
        image_list = [a.select_one("img").get("src") for a in soup.find("div", class_="td-post-featured-image").select("a")]
    content = soup.find("div", class_="td-post-content")
    part = content.select("p") if content.select("p") else content.select("div")
    body = ''
    for p in part:
        body += (p.text + '\n')
    abstract = body.split("।", 1)[0]  # first sentence, split on the Devanagari danda
    item["title"] = title
    item["pub_time"] = Util.format_time2(pub_time)
    item["images"] = image_list
    item["abstract"] = abstract
    item["body"] = body
    yield item
def parse(self, response):
    html = BeautifulSoup(response.text, 'html.parser')
    if response.url == 'https://www.sunstar.com.ph/Philippines':
        for i in html.select('.tablecenter > a')[0:8]:
            yield Request(i.attrs['href'])
    elif re.findall(r'https://www.sunstar.com.ph/article/\d+/\S+?/\S+?/\S+?', response.url) != []:
        item = DemoItem()
        parts = response.url.split('/')  # renamed from `list`, which shadowed the builtin
        item['title'] = html.select('.titleArticle > h1')[0].text
        item['category1'] = parts[5]
        if re.findall(r'\d+', parts[6]) == []:
            item['category2'] = parts[6]
        item['body'] = html.select('.col-sm-11 p')[0].text
        item['abstract'] = html.select('.col-sm-11 p')[0].text
        item['pub_time'] = Util.format_time2(html.select('.articleDate')[0].text)
        if html.select('.imgArticle > img') != []:
            item['images'] = [html.select('.imgArticle > img')[0].attrs['src']]
        yield item
    else:
        for i in html.select('.sectionTopWidget > div > div .ratio'):
            yield Request(i.attrs['href'])
        for i in html.select('.moreSectionWidget > div > div a[class="title-C20 title blu-hover"]'):
            yield Request(i.attrs['href'])
def get_news_detail(self, response):
    '''
    :param response: response for the news article page
    :return: detailed news page information
    '''
    item = response.meta["item"]
    soup = bs(response.text, "html.parser")
    title = soup.find("h1", class_="entry-title").text.strip() if soup.find("h1", class_="entry-title") else None
    pub_time = Util.format_time2(soup.find("span", class_="item-metadata posts-date").text.strip())
    content = soup.find("div", class_="entry-content")
    image_list = [
        content.find("figure", class_="wp-block-image size-large").select_one("img").get("data-src")
    ] if content.find("figure", class_="wp-block-image size-large") else []
    body = ''
    for p in content.select("p"):
        body += (p.text.strip() + '\n')
    if soup.find("pre", class_="wp-block-code"):
        body += soup.find("pre", class_="wp-block-code").text
    abstract = body.split('।')[0]  # the abstract is the article's first sentence
    item["title"] = title
    item["pub_time"] = pub_time
    item["images"] = image_list
    item["abstract"] = abstract
    item["body"] = body
    yield item
def parse_details(self, response):
    item = DemoItem()
    soup = BeautifulSoup(response.text, 'lxml')
    item['category1'] = response.meta['category1']
    item['category2'] = response.meta['category2']
    item['title'] = soup.find('h1', class_='entry-title').text.strip() if soup.find('h1', class_='entry-title') else None
    item['body'] = ''  # don't forget to initialize
    entry = soup.find('div', class_='entry clearfix')
    body_list = entry.select('p') if entry else []
    for body in body_list:
        item['body'] += body.text.strip()
        item['body'] += '\n'
    item['abstract'] = body_list[0].text.strip() if body_list else None
    item['images'] = []
    image_list = entry.select('p>img') if entry else []
    for image in image_list:
        item['images'].append(image.get('src'))
    pub = soup.find('span', class_='updated')  # the original called .text on the find result before checking it existed
    if pub:
        item['pub_time'] = Util.format_time2(pub.text.strip())
    yield item
def get_next_page(self, response):
    item = response.meta["item"]
    soup = bs(response.text, "html.parser")
    div = soup.find("div", class_="twp-row")
    trail = soup.find_all("li", class_="trail-item")
    for article in div.find_all("a", class_="post-thumbnail"):
        news_url = article.get("href")
        item['category1'] = trail[1].text
        # a second category exists unless the breadcrumb trail ends in a 'Page N' crumb
        if len(trail) > 3 or (len(trail) == 3 and trail[-1].text.split(" ")[0] != "Page"):
            item['category2'] = trail[2].text
        else:
            item['category2'] = None
        yield scrapy.Request(news_url, meta=response.meta, callback=self.get_news_detail)
    if self.time is None or Util.format_time3(Util.format_time2(div.find_all("span", class_="item-metadata posts-date")[-1].text.strip())) >= int(self.time):
        next_url = soup.find("a", class_="next page-numbers").get("href") if soup.find("a", class_="next page-numbers") else None
        if next_url:
            yield scrapy.Request(next_url, meta=response.meta, callback=self.get_next_page)
    else:
        self.logger.info('Time cutoff reached')
def parse_news(self, response):
    soup = bs(response.text, "html.parser")
    item = DemoItem()
    item["pub_time"] = Util.format_time2(soup.select('.post-meta > span')[1].text)
    item["title"] = soup.find("h1", class_="post-title entry-title").text.strip() if soup.find("h1", class_="post-title entry-title") else None
    item["images"] = [soup.find("div", class_="single-post-thumb").find("img").get("src")] if soup.find("div", class_="single-post-thumb") else None
    item["category1"] = soup.select_one("#main-content > div > article > div > p > span:nth-child(3) > a").text.strip()
    item["category2"] = None
    entry = soup.find("div", class_="entry")
    # the original's else-branch repeated the same expression and would have crashed when entry was missing
    item["abstract"] = entry.find("p").text.strip() if entry else None
    body = [p.text.strip() for p in entry.find_all("p")] if entry else []
    item["body"] = "\n".join(body)
    yield item