def parse_news(self, response):
    """Yield one ``NewsItem`` scraped from a www.jpm.gov.bn article page.

    The item is tagged as Brunei (country code ``"2"``). ``content`` and
    ``title`` are the concatenated XPath matches inside the ``WebPartWPQ3``
    element; ``time`` is always ``None`` for this source.

    ``image_urls`` holds one absolute URL per ``<img>`` ``src`` found in the
    article body and is an empty list when there is none. The site root is
    prepended to each ``src`` individually: prepending it to the joined
    string made a page without images report the bare site root as an image
    and fused several images into a single invalid URL.

    Any exception raised while building the item is passed to ``log`` and
    nothing is yielded.
    """
    try:
        print("parse:" + response.url)
        site_root = "http://www.jpm.gov.bn"

        item = NewsItem()
        item['url'] = response.url
        item['country_code'] = "2"
        item['country_name'] = "Brunei"
        item['source'] = site_root

        # NOTE(review): these paths require <tbody> elements in the served
        # markup — confirm against the raw HTML rather than a browser DOM.
        image_paths = response.xpath(
            '//*[@id="WebPartWPQ3"]/table/tbody/tr[3]/td'
            '/table/tbody/tr[2]/td/div/p/img/@src').extract()
        # extract() returns a list: build one URL per match, skip blanks.
        item['image_urls'] = [
            site_root + path.strip() for path in image_paths if path.strip()
        ]

        item['content'] = "".join(response.xpath(
            '//*[@id="WebPartWPQ3"]/table/tbody/tr[3]/td'
            '/table/tbody/tr[2]/td/div/div').extract())
        item['title'] = "".join(response.xpath(
            '//*[@id="WebPartWPQ3"]/table/tbody/tr[3]/td'
            '/table/tbody/tr[1]/td/text()').extract())
        item['time'] = None
        item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime())
        yield item
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` scraped from a www.miti.gov.my article page.

    The item is tagged as Malaysia (country code ``"6"``) and carries no
    image URLs. Every ``src="`` in the extracted content is prefixed with the
    site address. Any exception is handed to ``log`` and no item is yielded.
    """
    try:
        print("parse:" + response.url)

        def collect(path):
            # Concatenate every node matched by ``path`` into one string.
            return "".join(response.xpath(path).extract())

        record = NewsItem()
        record['url'] = response.url
        record['country_code'] = "6"
        record['country_name'] = "Malaysia"
        record['image_urls'] = None

        body_html = collect(
            '//*[@id="container_content"]/div[@class="editable"]')
        record['content'] = body_html.replace(
            'src="', 'src="http://www.miti.gov.my/')

        record['source'] = "http://www.miti.gov.my"
        record['title'] = collect('//*[@id="365"]/div[2]/div[1]/h1/text()')
        record['time'] = collect(
            '//*[@id="container_content"]/div[3]/p/em/text()')
        record['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime())
        yield record
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` scraped from a www.pmo.gov.sg article page.

    The item is tagged as Singapore (country code ``"9"``).

    Two defects of the earlier version are corrected here:

    * the content path began with a single ``/``, i.e. it required the
      document's root element to be ``div[@class="row qna"]``; in an HTML
      document the root is ``<html>``, so ``content`` was always empty. The
      path now starts with ``//`` like every other path in this callback.
    * image ``src`` values were joined into one string before being split on
      whitespace, so two or more images collapsed into one invalid URL. Each
      match is now kept as its own list entry.

    Any exception raised while building the item is passed to ``log`` and
    nothing is yielded.
    """
    try:
        print("parse:" + response.url)
        item = NewsItem()
        item['url'] = response.url
        item['country_code'] = "9"
        item['country_name'] = "Singapore"

        image_sources = response.xpath(
            '//*[@id="block-views-newsroom-page-block-1"]'
            '/div/div/div/div/div[2]/img/@src').extract()
        # One entry per matched <img>; blank attributes are ignored.
        item['image_urls'] = [
            src.strip() for src in image_sources if src.strip()
        ]

        # Paragraph text nodes, separated by blank lines in the output HTML.
        item['content'] = "<br /><br />".join(response.xpath(
            '//div[@class="row qna"]'
            '/div/p/text()[name(..)!="img"]').extract())

        item['source'] = "http://www.pmo.gov.sg"
        item['title'] = "".join(response.xpath(
            '//div[@class="breadcrumb"]'
            '/span[@class="inline even last"]/a/text()').extract())
        item['time'] = "".join(response.xpath(
            '//div[@class="col-sm-12 col-md-12"]'
            '/div[@class="meta-table"]/text()').extract())
        item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime())
        yield item
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` scraped from a www.commerce.gov.mm page.

    The item is tagged as Myanmar (country code ``"7"``); ``image_urls`` and
    ``time`` are always ``None``. Every occurrence of ``"img "`` in the
    extracted content is replaced by a single space. Any exception is handed
    to ``log`` and no item is yielded.
    """
    try:
        print("parse:" + response.url)

        story = NewsItem()
        story['url'] = response.url
        story['country_code'] = "7"
        story['country_name'] = "Myanmar"
        story['source'] = "http://www.commerce.gov.mm"
        story['image_urls'] = None

        field_nodes = response.xpath('//div[@class="field-item even"]')
        raw_html = "".join(field_nodes.extract())
        story['content'] = str(raw_html.replace("img ", " "))

        title_nodes = response.xpath('//*[@id="page-title"]/text()')
        story['title'] = "".join(title_nodes.extract())
        story['time'] = None
        story['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime())
        yield story
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` scraped from a www.dfa.gov.ph article page.

    The item is tagged as Philippines (country code ``"8"``). ``content`` is
    the article-body ``<span>`` elements joined with ``<br /><br />``, with
    every ``src="`` prefixed by the site address.

    ``image_urls`` holds one absolute URL per ``<img>`` ``src`` in the
    article body and is an empty list when there is none. The site root is
    prepended to each ``src`` individually: prepending it to the joined
    string made a page without images report the bare site root as an image
    and fused several images into a single invalid URL.

    Any exception raised while building the item is passed to ``log`` and
    nothing is yielded.
    """
    try:
        print("parse:" + response.url)
        site_root = "https://www.dfa.gov.ph"

        item = NewsItem()
        item['url'] = response.url
        item['country_code'] = "8"
        item['country_name'] = "Philippines"

        image_paths = response.xpath(
            '//div[@class="item-page"]/div[@itemprop="articleBody"]'
            '/p/img/@src').extract()
        # extract() returns a list: build one URL per match, skip blanks.
        item['image_urls'] = [
            site_root + path.strip() for path in image_paths if path.strip()
        ]

        paragraphs = response.xpath(
            '//div[@class="item-page"]/div[@itemprop="articleBody"]'
            '/p/span').extract()
        item['content'] = "<br /><br />".join(paragraphs).replace(
            'src="', 'src="https://www.dfa.gov.ph')

        item['source'] = site_root
        item['title'] = "".join(response.xpath(
            '//*[@id="banner"]/div/div/header/h1/text()').extract())
        item['time'] = "".join(response.xpath(
            '//*[@id="content"]/div[2]/div[2]/dl/dd[2]/time/text()'
        ).extract())
        item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime())
        yield item
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` scraped from a www.soa.gov.cn article page.

    The item is tagged as China (country code ``"1"``) and carries no image
    URLs. ``content`` is the matched ``<font>`` elements of the editor block
    joined with ``<br />``. Any exception is handed to ``log`` and no item is
    yielded.
    """
    try:
        print("parse:" + response.url)

        def matches(path):
            # All strings extracted for ``path``, in document order.
            return response.xpath(path).extract()

        article = NewsItem()
        article['url'] = response.url
        article['country_code'] = "1"
        article['country_name'] = "China"
        article['source'] = "http://www.soa.gov.cn"
        article['image_urls'] = None

        fragments = matches(
            '//div[@class="kuang_xiangqing"]/div/div[@class="TRS_Editor"]'
            '/p/font[name(..)!="img"][normalize-space()]')
        article['content'] = "<br />".join(fragments)

        article['title'] = "".join(
            matches('//div[@class="kuang_xiangqing"]/p[2]/text()'))
        article['time'] = "".join(matches(
            '//div[@class="kuang_xiangqing"]/div[@class="subhead"]/text()'))
        article['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                time.localtime())
        yield article
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` scraped from a cn.news.chinhphu.vn page.

    The item is tagged as Vietnam (country code ``"11"``) and carries no
    image URLs. Every ``src="`` in the extracted content is prefixed with the
    site address. Any exception is handed to ``log`` and no item is yielded.
    """
    try:
        print("parse:" + response.url)

        def flatten(path):
            # Concatenate every node matched by ``path`` into one string.
            return "".join(response.xpath(path).extract())

        news = NewsItem()
        news['url'] = response.url
        news['country_code'] = "11"
        news['country_name'] = "Vietnam"
        news['image_urls'] = None

        body_html = flatten(
            '//*[@id="aspnetForm"]/div[3]/div[2]/div[2]/div[6]')
        news['content'] = str(
            body_html.replace('src="', 'src="http://cn.news.chinhphu.vn'))

        news['source'] = "http://cn.news.chinhphu.vn"
        news['title'] = flatten(
            '//*[@id="ctl00_mainContent_bodyContent_lbHeadline"]/text()')
        news['time'] = flatten(
            '//*[@id="ctl00_mainContent_bodyContent_lbDate"]/text()')
        news['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime())
        yield news
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` scraped from a www.mfaic.gov.kh article page.

    The item is tagged as Cambodia (country code ``"3"``) and carries no
    image URLs. Content, title and time are all taken from the
    ``content-detail`` block. Any exception is handed to ``log`` and no item
    is yielded.
    """
    try:
        print("parse:" + response.url)

        def grab(path):
            # Concatenate every node matched by ``path`` into one string.
            return "".join(response.xpath(path).extract())

        release = NewsItem()
        release['url'] = response.url
        release['country_code'] = "3"
        release['country_name'] = "Cambodia"
        release['source'] = "https://www.mfaic.gov.kh"
        release['image_urls'] = None
        release['content'] = grab('//div[@class="content-detail"]')
        release['title'] = grab(
            '//div[@class="content-detail"]/h4[@class="title-press"]/text()')
        release['time'] = grab(
            '//div[@class="content-detail"]/div[@class="date"]/ul/li/text()')
        release['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                time.localtime())
        yield release
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` scraped from a www.thaigov.go.th article page.

    The item is tagged as Thailand (country code ``"10"``) and carries no
    image URLs. Any exception is handed to ``log`` and no item is yielded.
    """
    try:
        print("parse:" + response.url)

        def joined(path):
            # Concatenate every node matched by ``path`` into one string.
            return "".join(response.xpath(path).extract())

        post = NewsItem()
        post['url'] = response.url
        post['country_code'] = "10"
        post['country_name'] = "Thailand"
        post['image_urls'] = None
        post['content'] = joined('//div[@class="border-normal clearfix"]')
        post['source'] = "http://www.thaigov.go.th"
        post['title'] = joined(
            '//div[@class="col-xs-12 col-sm-6 padding-sm1 news-2"]'
            '/h3[@class="news-1 Circular color2"]/text()')
        post['time'] = joined(
            '//*[@id="banner-group"]/div/div/div/div[1]/div[2]/div[1]'
            '/p[1]/span[2]/text()')
        post['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime())
        yield post
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` for a worldmaritimenews.com article page.

    ``response.meta["country"]`` selects the country code/name pair stored on
    the item; a value that matches no table row leaves both fields unset.
    ``image_urls`` is the whitespace-split concatenation of the
    ``post_thumbnail`` link targets, and ``content`` is the article's ``<p>``
    elements joined with ``<br />``. Any exception is handed to ``log`` and
    no item is yielded.
    """
    try:
        print("parse:" + response.url)
        country = response.meta["country"]

        entry = NewsItem()
        entry['url'] = response.url

        # (meta value, country_code, country_name); first equal row wins.
        country_rows = (
            ('China', "1", "China"),
            ('Brunei', "2", "Brunei"),
            ('Cambodia', "3", "Cambodia"),
            ('Indonesia', "4", "Indonesia"),
            ('Lao', "5", "Lao"),
            ('Malaysia', "6", "Malaysia"),
            ('Myanmar', "7", "Myanmar"),
            ('Philippine', "8", "Philippine"),
            ('Singapore', "9", "Singapore"),
            ('Thailand', "10", "Thailand"),
            ('Vietnam', "11", "Vietnam"),
        )
        for label, code, name in country_rows:
            if country == label:
                entry['country_code'] = code
                entry['country_name'] = name
                break

        entry['source'] = "https://worldmaritimenews.com"

        thumbnail_links = response.xpath(
            '//*[@id="post_thumbnail"]/a/@href').extract()
        entry['image_urls'] = "".join(thumbnail_links).split()

        paragraphs = Selector(response=response).xpath(
            '//div[@class="block-row"]/article/div[@class="content"]/p'
        ).extract()
        entry['content'] = "<br />".join(paragraphs)

        entry['title'] = "".join(response.xpath(
            '//div[@class="block-row"]/article/header/h1/text()').extract())
        entry['time'] = "".join(response.xpath(
            '//div[@class="block-row"]/article/footer/p/text()').extract())
        entry['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime())
        yield entry
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` for a www.seatrade-maritime.com article page.

    ``response.meta["country"]`` selects the country code/name pair stored on
    the item; a value that matches no table row leaves both fields unset.

    ``image_urls`` is ``None`` when the page has no item-image link, and
    otherwise a list with one absolute URL per link. Previously the site
    root was prepended before the emptiness check, so the check could never
    succeed: pages without an image reported the bare site root as an image
    instead of ``None``, and several links were fused into one invalid URL.

    Any exception raised while building the item is passed to ``log`` and
    nothing is yielded.
    """
    try:
        print("parse:" + response.url)
        country = response.meta["country"]
        site_root = "http://www.seatrade-maritime.com"

        item = NewsItem()
        item['url'] = response.url

        # (meta value, country_code, country_name); first equal row wins.
        country_rows = (
            ('China', "1", "China"),
            ('Brunei', "2", "Brunei"),
            ('Cambodia', "3", "Cambodia"),
            ('Indonesia', "4", "Indonesia"),
            ('Lao', "5", "Lao"),
            ('Malaysia', "6", "Malaysia"),
            ('Myanmar', "7", "Myanmar"),
            ('Philippine', "8", "Philippine"),
            ('Singapore', "9", "Singapore"),
            ('Thailand', "10", "Thailand"),
            ('Vietnam', "11", "Vietnam"),
        )
        for label, code, name in country_rows:
            if country == label:
                item['country_code'] = code
                item['country_name'] = name
                break

        item['source'] = site_root
        item['content'] = "".join(response.xpath(
            '//*[@id="k2Container"]/div[@class="itemBody"]'
            '/div[@class="contentBlock"]').extract())
        item['title'] = "".join(response.xpath(
            '//*[@id="k2Container"]/div[@class="itemHeader"]/h1/text()'
        ).extract())
        item['time'] = "".join(response.xpath(
            '//*[@id="k2Container"]/div[@class="itemBody"]/span/text()'
        ).extract())
        item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime())

        image_paths = [
            path.strip()
            for path in response.xpath(
                '//*[@id="k2Container"]/div[@class="itemBody"]'
                '/div[@class="itemImageBlock"]/span[@class="itemImage"]'
                '/a/@href').extract()
            if path.strip()
        ]
        # Test the raw matches, not the prefixed string, so "no image"
        # really results in None.
        if image_paths:
            item['image_urls'] = [site_root + path for path in image_paths]
        else:
            item['image_urls'] = None
        yield item
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Return one ``NewsItem`` for a www.eworldship.com article page.

    ``response.meta["country"]`` is a Chinese country name that selects the
    country code and English name stored on the item; a value that matches
    no table row leaves both fields unset. ``image_urls`` is the list of
    ``<img>`` sources in the article content, or ``None`` when there are
    none.

    Unlike a generator callback, this function returns the item directly.
    If any exception is raised it is handed to ``log`` and ``None`` is
    returned.
    """
    try:
        print("parse:" + response.url)
        country = response.meta["country"]

        def flat(path):
            # Concatenate every node matched by ``path`` into one string.
            return "".join(response.xpath(path).extract())

        doc = NewsItem()
        doc['url'] = response.url

        # (meta value, country_code, country_name); first equal row wins.
        country_rows = (
            ('中国', "1", "China"),
            ('文莱', "2", "Brunei"),
            ('柬埔寨', "3", "Cambodia"),
            ('印度尼西亚', "4", "Indonesia"),
            ('老挝', "5", "Lao"),
            ('马来西亚', "6", "Malaysia"),
            ('缅甸', "7", "Myanmar"),
            ('菲律宾', "8", "Philippine"),
            ('新加坡', "9", "Singapore"),
            ('泰国', "10", "Thailand"),
            ('越南', "11", "Vietnam"),
        )
        for label, code, name in country_rows:
            if country == label:
                doc['country_code'] = code
                doc['country_name'] = name
                break

        doc['source'] = "http://www.eworldship.com"
        doc['content'] = flat('//*[@id="nArticle"]/div[3]')
        doc['title'] = flat('//*[@id="nArticle"]/h1/text()')
        doc['time'] = flat('//*[@id="artical_sth"]/p/span[1]/text()')
        doc['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime())

        picture_sources = response.xpath(
            '//*[@id="nArticle"]/div[@class="content"]/p/img/@src'
        ).extract()
        # An empty match list is stored as None rather than [].
        doc['image_urls'] = picture_sources if picture_sources else None
        return doc
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` for a www.marinelink.com article page.

    ``response.meta["country"]`` selects the country code/name pair stored on
    the item; a value that matches no table row leaves both fields unset.
    ``image_urls`` is the list of link targets inside the ``tmp8`` block, or
    ``None`` when there are none. Any exception is handed to ``log`` and no
    item is yielded.
    """
    try:
        print("parse:" + response.url)
        country = response.meta["country"]

        def flat(path):
            # Concatenate every node matched by ``path`` into one string.
            return "".join(response.xpath(path).extract())

        story = NewsItem()
        story['url'] = response.url

        # (meta value, country_code, country_name); first equal row wins.
        country_rows = (
            ('China', "1", "China"),
            ('Brunei', "2", "Brunei"),
            ('Cambodia', "3", "Cambodia"),
            ('Indonesia', "4", "Indonesia"),
            ('Lao', "5", "Lao"),
            ('Malaysia', "6", "Malaysia"),
            ('Myanmar', "7", "Myanmar"),
            ('Philippine', "8", "Philippine"),
            ('Singapore', "9", "Singapore"),
            ('Thailand', "10", "Thailand"),
            ('Vietnam', "11", "Vietnam"),
        )
        for label, code, name in country_rows:
            if country == label:
                story['country_code'] = code
                story['country_name'] = name
                break

        story['source'] = "https://www.marinelink.com"

        link_targets = response.xpath(
            '//div[@class="innertube"]/article/div[@class="tmp8"]/a/@href'
        ).extract()
        # An empty match list is stored as None rather than [].
        story['image_urls'] = link_targets if link_targets else None

        story['content'] = flat(
            '//div[@class="innertube"]/article/div[@itemprop="text"]')
        story['title'] = flat('//div[@class="innertube"]/article/h1/text()')
        story['time'] = flat(
            '//div[@class="innertube"]/article/p[@class="meta"]'
            '/span[@class="date"]/text()')
        story['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime())
        yield story
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` for a www.ics-shipping.org article page.

    ``response.meta["country"]`` is printed and then selects the country
    code/name pair stored on the item; a value that matches no table row
    leaves both fields unset. The title is the document ``<title>`` text and
    ``image_urls`` is always ``None``. Any exception is handed to ``log`` and
    no item is yielded.
    """
    try:
        print("parse:" + response.url)
        country = response.meta["country"]
        print(country)

        def flat(path):
            # Concatenate every node matched by ``path`` into one string.
            return "".join(response.xpath(path).extract())

        notice = NewsItem()
        notice['url'] = response.url

        # (meta value, country_code, country_name); first equal row wins.
        country_rows = (
            ('China', "1", "China"),
            ('Brunei', "2", "Brunei"),
            ('Cambodia', "3", "Cambodia"),
            ('Indonesia', "4", "Indonesia"),
            ('Lao', "5", "Lao"),
            ('Malaysia', "6", "Malaysia"),
            ('Myanmar', "7", "Myanmar"),
            ('Philippine', "8", "Philippine"),
            ('Singapore', "9", "Singapore"),
            ('Thailand', "10", "Thailand"),
            ('Vietnam', "11", "Vietnam"),
        )
        for label, code, name in country_rows:
            if country == label:
                notice['country_code'] = code
                notice['country_name'] = name
                break

        notice['source'] = "http://www.ics-shipping.org"
        notice['content'] = flat(
            '//*[@id="MainContent_T2E4BE915010_Col00"]/div/div[2]')
        notice['title'] = flat('//title/text()')
        notice['time'] = flat(
            '//*[@id="MainContent_T2E4BE915010_Col00"]/div/div[1]/text()')
        notice['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime())
        notice['image_urls'] = None
        yield notice
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` for a www.worldcargonews.com article page.

    ``response.meta["country"]`` is printed and then selects the country
    code/name pair stored on the item; a value that matches no table row
    leaves both fields unset. ``image_urls`` is always ``None``.

    The article HTML has five fixed ``<img>`` snippets (four share icons and
    one specific article image) stripped out before it is stored. The
    results of ``str.replace`` were previously discarded — strings are
    immutable, so the content was stored unchanged; each result is now
    assigned back.

    Any exception raised while building the item is passed to ``log`` and
    nothing is yielded.
    """
    try:
        print("parse:" + response.url)
        country = response.meta["country"]
        print(country)

        item = NewsItem()
        item['url'] = response.url

        # (meta value, country_code, country_name); first equal row wins.
        country_rows = (
            ('China', "1", "China"),
            ('Brunei', "2", "Brunei"),
            ('Cambodia', "3", "Cambodia"),
            ('Indonesia', "4", "Indonesia"),
            ('Lao', "5", "Lao"),
            ('Malaysia', "6", "Malaysia"),
            ('Myanmar', "7", "Myanmar"),
            ('Philippine', "8", "Philippine"),
            ('Singapore', "9", "Singapore"),
            ('Thailand', "10", "Thailand"),
            ('Vietnam', "11", "Vietnam"),
        )
        for label, code, name in country_rows:
            if country == label:
                item['country_code'] = code
                item['country_name'] = name
                break

        item['source'] = "https://www.worldcargonews.com"

        content = "".join(
            response.xpath('//div[@class="ao-Article ao-CrIb"]').extract())
        # (markup to remove, replacement) — applied in this order.
        unwanted_markup = (
            ('<img src="/AcuCustom/Sitename/Icon/Icons/wcnIconLinkedIn.svg" alt="Linked In" title="Linked In" class="aos-DS5-Image aos-FL aos-MW100">', " "),
            ('<img src="/AcuCustom/Sitename/Icon/Icons/wcnIconTwitter.svg" alt="Twitter" title="Twitter" class="aos-DS5-Image aos-FL aos-MW100">', ' '),
            ('<img src="/AcuCustom/Sitename/Icon/Icons/wcnIconFB.svg" alt="Facebook" title="Facebook" class="aos-DS5-Image aos-FL aos-MW100">', ''),
            ('<img src="/AcuCustom/Sitename/Icon/Icons/wcnIconGoogleP.svg" alt="Google Plus" title="Google Plus" class="aos-DS5-Image aos-FL aos-MW100">', ''),
            ('<img src="/AcuCustom/Sitename/DAM/002/HHMC_STS_Delivered_to_Port_Louis.jpg" alt="HHMC STS cranes in Mauritius" title="HHMC STS cranes in Mauritius" class="aos-DS5-Image aos-W100">', ''),
        )
        for snippet, replacement in unwanted_markup:
            # str.replace returns a new string; keep the result.
            content = content.replace(snippet, replacement)
        item['content'] = content

        item['title'] = "".join(response.xpath(
            '//div[@class="ao-Article ao-CrIb"]/h1/text()').extract())
        item['time'] = "".join(response.xpath(
            '//span[@class="aos-ArticleDate aos-MRXS4 aos-NM aos-FL aos-DF"]'
            '/text()').extract())
        item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime())
        item['image_urls'] = None
        yield item
    except Exception as error:
        log(error)
def parse_news(self, response):
    """Yield one ``NewsItem`` for a www.imo.org page.

    ``response.meta["country"]`` selects the country code/name pair stored on
    the item; a value that matches no table row leaves both fields unset.
    ``image_urls`` and ``time`` are always ``None``, and every occurrence of
    ``"img "`` in the extracted page layout is replaced by a single space.
    Any exception is handed to ``log`` and no item is yielded.
    """
    try:
        print("parse:" + response.url)
        country = response.meta["country"]

        page = NewsItem()
        page['url'] = response.url

        # (meta value, country_code, country_name); first equal row wins.
        country_rows = (
            ('China', "1", "China"),
            ('Brunei', "2", "Brunei"),
            ('Cambodia', "3", "Cambodia"),
            ('Indonesia', "4", "Indonesia"),
            ('Lao', "5", "Lao"),
            ('Malaysia', "6", "Malaysia"),
            ('Myanmar', "7", "Myanmar"),
            ('Philippine', "8", "Philippine"),
            ('Singapore', "9", "Singapore"),
            ('Thailand', "10", "Thailand"),
            ('Vietnam', "11", "Vietnam"),
        )
        for label, code, name in country_rows:
            if country == label:
                page['country_code'] = code
                page['country_name'] = name
                break

        # NOTE(review): a debug print of country_code + country_name follows
        # the country chain in the source; it is read here as commented out
        # — confirm it was not meant to be live.

        page['source'] = "http://www.imo.org"
        page['image_urls'] = None

        layout_html = "".join(
            response.xpath('//*[@id="imo-pageLayout"]').extract())
        page['content'] = str(layout_html.replace("img ", " "))

        page['title'] = "".join(
            response.xpath('//*[@id="imo-pageLayout"]/h1/text()').extract())
        page['time'] = None
        page['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                             time.localtime())
        yield page
    except Exception as error:
        log(error)