def parse_news(self, response): try: article = response.meta['article'] #agency of news article['agency'] = u'朝日新聞' #title of news article['title'] = response.xpath('//*[@id="MainInner"]/div[1]/div/h1/text()').extract()[0] pos_1 = response.url.find('cles') pos_2 = response.url.find('.html') #get aid of news article['aid'] = response.url[pos_1 + 5: pos_2] date = response.xpath('//*[@id="MainInner"]/div[1]/div/p/text()').extract()[0] #get date of news article['date'] = date.replace(u'年','-').replace(u'月','-').replace(u'日',' ').replace(u'時',':').replace(u'分',':') #get contents of news contents = ''.join(response.xpath('//div[@class="ArticleText"]//text()').extract()).strip() #article['contents'] = ''.join(response.xpath('//div[@class="ArticleText"]//text()').extract()).strip() #Get keywords and tagged_text rake = jpRake() keywords_list = rake.run(contents) keywords = '\n'.join(keywords_list) tagged_text = rake.get_tagged_text() article['keywords'] = keywords article['tagged_text'] = tagged_text article['contents'] = contents yield article except Exception, e: print 'Parse_news ERROR!!!!!!!!!!!!! URL :'+ response.url print traceback.print_exc(file = sys.stdout)
def parse_news(self, response, video): time.sleep(1) try: article = response.meta['article'] news_url = response.url #determine the location of content based on whether there has a video if video == 0: contents = ' '.join( response.xpath( '//p[@class="ynDetailText"]//text()').extract()) article['category'] = response.xpath( '//div[@class="gnSecWrap"]//li[@class="current"]/a/text()' ).extract()[0] else: contents = ' '.join( response.xpath('//div[@class="ymuiContainerNopad"]//text()' ).extract()) article['category'] = response.xpath( '//div[@id="subNav"]/ul/li/a//span[@class="select"]/text()' ).extract()[0] content_1 = ''.join(contents).strip() #Get keywords and tagged_text rake = jpRake() keywords_list = rake.run(content_1) keywords = '\n'.join(keywords_list) tagged_text = rake.get_tagged_text() article['keywords'] = keywords article['tagged_text'] = tagged_text article['contents'] = content_1 yield article comment_url = response.url.replace( 'hl?a', 'cm/main?d') + '&s=create_time&o=desc&p=1' yield scrapy.Request(comment_url, callback=lambda response, news_url=news_url: self.get_comment_url(response, news_url)) except Exception, e: print 'Parse_news ERROR!!!!!!!!!!!!! URL :' + response.url print traceback.print_exc(file=sys.stdout)
def parse(self, response):
    """Crawl the Yomiuri premium news index with Selenium.

    Walks each category page found on the index, opens every article
    link, extracts the article fields plus RAKE keyword output, and
    yields one YomiuriArticleItem per article. The webdriver and the
    virtual display are always torn down in ``finally``.
    """
    try:
        self.driver.get(
            'http://premium.yomiuri.co.jp/pc/#!/list_NEWS%255fMAIN')
        # Collect the category page URLs from the index page.
        c_list = self.driver.find_elements_by_xpath(
            '//div[@class="yp_layout_template"]/div[@class="loft_article_sttl"]/a')
        c_list = [c.get_attribute("href") for c in c_list]
        for c in c_list:
            self.driver.get(c)
            time.sleep(5)  # wait for the JS-rendered article list
            url_list = self.driver.find_elements_by_xpath(
                '//a[@class="yp_article_link"]')
            news_urls = [news.get_attribute("href") for news in url_list]
            for u in news_urls:
                self.driver.get(u)
                time.sleep(2)  # wait for the article page to render
                # The article id sits between '/news_201...' and
                # '/list_NEWS' in the URL.
                news_aid = u[u.find(u'/news_201') + 1:u.find(u'/list_NEWS')]
                news_title = self.driver.find_element_by_xpath(
                    '//div[@class="yp_article_title"]').text
                news_date = self.driver.find_element_by_xpath(
                    '//div[@class="yp_article_credit"]').text
                news_date = time.strftime('%Y-%m-%d %H:%M:%S',
                                          self.parse_date(news_date))
                news_content = self.driver.find_elements_by_xpath(
                    '//div[@class="yp_article_body"]')
                news_content = ''.join(
                    [content.text for content in news_content])
                # Keyword extraction (RAKE) and tagged text; run()
                # must be called before get_tagged_text().
                rake = jpRake()
                keywords_list = rake.run(news_content)
                keywords = '\n'.join(keywords_list)
                tagged_text = rake.get_tagged_text()
                article = YomiuriArticleItem()
                article['url'] = u
                article['aid'] = news_aid
                article['title'] = news_title
                article['date'] = news_date
                article['contents'] = news_content
                article['keywords'] = keywords
                article['tagged_text'] = tagged_text
                yield article
    finally:
        # PEP 8: compare against None with identity, not '!='.
        if self.driver is not None:
            self.driver.close()
            self.driver.quit()
        if self.display is not None:
            self.display.stop()
def parse_news(self, response):
    """Fill the pending article item with body text and RAKE output."""
    article = response.meta['article']
    # Pull every body paragraph and flatten them into one string.
    paragraphs = response.xpath(
        './/div[@id="main"]//div[@class="main-text"]/p[@class="txt"]/text()'
    ).extract()
    body_text = ' '.join(paragraphs).strip()
    # Keyword extraction: run() must precede get_tagged_text().
    extractor = jpRake()
    article['keywords'] = '\n'.join(extractor.run(body_text))
    article['tagged_text'] = extractor.get_tagged_text()
    article['contents'] = body_text
    yield article
def parse_news(self, response): print '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' print response.url try: article = NikkeiArticleItem() #get title of news article['title'] = ''.join( response.xpath( '//h1[@class="cmn-article_title cmn-clearfix"]/span/text()' ).extract()).strip() #get contents of news #article['contents'] = ''.join(response.xpath('//div[@class="cmn-article_text JSID_key_fonttxt"]/p//text()').extract()).strip() contents = ''.join( response.xpath( '//div[@class="cmn-article_text JSID_key_fonttxt"]/p//text()' ).extract()).strip() #Get keywords and tagged_text rake = jpRake() keywords_list = rake.run(contents) keywords = '\n'.join(keywords_list) tagged_text = rake.get_tagged_text() article['contents'] = contents article['keywords'] = keywords article['tagged_text'] = tagged_text #get url of news article['url'] = response.url #set agency of news article['agency'] = u'日本経済新聞' #get aid of news pos = response.url.find('ng=') article['aid'] = response.url[pos + 3:] #get date of news date_time = self.date[0:4] + '-' + self.date[ 4:6] + '-' + self.date[6:] + ' ' + '03:00:00' article['date'] = date_time yield article except Exception, e: print 'Parse_news ERROR!!!!!!!!!!!!! URL :' + response.url print traceback.print_exc(file=sys.stdout)
def parse_news(self, response):
    """Populate the carried article item from the page body and yield it."""
    article = response.meta['article']
    # Flatten the body paragraphs into a single space-joined string.
    text_nodes = response.xpath(
        './/div[@id="main"]//div[@class="main-text"]/p[@class="txt"]/text()'
    ).extract()
    body = ' '.join(text_nodes).strip()
    rake = jpRake()
    kw_list = rake.run(body)  # keyword extraction must run first
    article['keywords'] = '\n'.join(kw_list)
    article['tagged_text'] = rake.get_tagged_text()
    article['contents'] = body
    yield article
def parse_news(self, response): try: article = response.meta['article'] #agency of news article['agency'] = u'朝日新聞' #title of news article['title'] = response.xpath( '//*[@id="MainInner"]/div[1]/div/h1/text()').extract()[0] pos_1 = response.url.find('cles') pos_2 = response.url.find('.html') #get aid of news article['aid'] = response.url[pos_1 + 5:pos_2] date = response.xpath( '//*[@id="MainInner"]/div[1]/div/p/text()').extract()[0] #get date of news article['date'] = date.replace(u'年', '-').replace( u'月', '-').replace(u'日', ' ').replace(u'時', ':').replace(u'分', ':') #get contents of news contents = ''.join( response.xpath( '//div[@class="ArticleText"]//text()').extract()).strip() #article['contents'] = ''.join(response.xpath('//div[@class="ArticleText"]//text()').extract()).strip() #Get keywords and tagged_text rake = jpRake() keywords_list = rake.run(contents) keywords = '\n'.join(keywords_list) tagged_text = rake.get_tagged_text() article['keywords'] = keywords article['tagged_text'] = tagged_text article['contents'] = contents yield article except Exception, e: print 'Parse_news ERROR!!!!!!!!!!!!! URL :' + response.url print traceback.print_exc(file=sys.stdout)
def parse_next_page(self, response): try: article = response.meta['article'] content = response.meta['contents'] content_1 = response.xpath('//*[@id="primary"]/section/article/div[2]/p/text()').extract() content_1_1 = ''.join(content_1) #merger this page's content with previous content content_2 = content + content_1_1 #Get keywords and tagged_text rake = jpRake() keywords_list = rake.run(content_2) keywords = '\n'.join(keywords_list) tagged_text = rake.get_tagged_text() article['keywords'] = keywords article['tagged_text'] = tagged_text article['contents'] = content_2 pos_1 = response.url.find("-n") pos_2 = response.url.find(".html") page_count = response.url[pos_1 + 2 : pos_2] next_page = response.xpath('//*[@id="primary"]/section/div/a[@class="pageNext"]/@href').extract()[0] #determine whether there has a next page if 'more' in next_page: yield article else: next_url = response.url[: pos_1 + 2] + str(int(page_count) + 1) + response.url[pos_2:] req = scrapy.Request(next_url, callback = self.parse_next_page) req.meta['article'] = article req.meta['contents'] = content_2 yield req except Exception, e: print 'Parse_next_page ERROR!!!!!!!!!!!!! :'+response.url print traceback.print_exc(file = sys.stdout)
def parse_news(self, response, video): time.sleep(1) try: article = response.meta['article'] news_url = response.url #determine the location of content based on whether there has a video if video == 0: contents = ' '.join(response.xpath('//p[@class="ynDetailText"]//text()').extract()) article['category'] = response.xpath('//div[@class="gnSecWrap"]//li[@class="current"]/a/text()').extract()[0] else: contents = ' '.join(response.xpath('//div[@class="ymuiContainerNopad"]//text()').extract()) article['category'] = response.xpath('//div[@id="subNav"]/ul/li/a//span[@class="select"]/text()').extract()[0] content_1 = ''.join(contents).strip() #Get keywords and tagged_text rake = jpRake() keywords_list = rake.run(content_1) keywords = '\n'.join(keywords_list) tagged_text = rake.get_tagged_text() article['keywords'] = keywords article['tagged_text'] = tagged_text article['contents'] = content_1 yield article comment_url = response.url.replace('hl?a','cm/main?d') + '&s=create_time&o=desc&p=1' yield scrapy.Request(comment_url, callback = lambda response, news_url = news_url: self.get_comment_url(response, news_url)) except Exception, e: print 'Parse_news ERROR!!!!!!!!!!!!! URL :'+ response.url print traceback.print_exc(file = sys.stdout)
def parse_news(self, response): print '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$' print response.url try: article = NikkeiArticleItem() #get title of news article['title'] = ''.join(response.xpath('//h1[@class="cmn-article_title cmn-clearfix"]/span/text()').extract()).strip() #get contents of news #article['contents'] = ''.join(response.xpath('//div[@class="cmn-article_text JSID_key_fonttxt"]/p//text()').extract()).strip() contents = ''.join(response.xpath('//div[@class="cmn-article_text JSID_key_fonttxt"]/p//text()').extract()).strip() #Get keywords and tagged_text rake = jpRake() keywords_list = rake.run(contents) keywords = '\n'.join(keywords_list) tagged_text = rake.get_tagged_text() article['contents'] = contents article['keywords'] = keywords article['tagged_text'] = tagged_text #get url of news article['url'] = response.url #set agency of news article['agency'] = u'日本経済新聞' #get aid of news pos = response.url.find('ng=') article['aid'] = response.url[pos + 3:] #get date of news date_time = self.date[0:4] + '-' + self.date[4:6] +'-' + self.date[6:] + ' ' + '03:00:00' article['date'] = date_time yield article except Exception, e: print 'Parse_news ERROR!!!!!!!!!!!!! URL :'+ response.url print traceback.print_exc(file = sys.stdout)