Exemplo n.º 1
0
    def parse_news(self, response):
        """Build a ``NewsItem`` for a Brunei article page and yield it.

        Args:
            response: Fetched page exposing ``url`` and ``xpath()``
                (looks like a Scrapy response -- confirm against the spider).

        Yields:
            The populated ``NewsItem``. If anything raises while it is being
            built, the exception is handed to ``log`` and nothing is yielded.
        """
        try:
            print("parse:"+response.url)

            site = "http://www.jpm.gov.bn"

            item = NewsItem()
            item['url'] = response.url
            item['country_code'] = "2"
            item['country_name'] = "Brunei"
            item['source'] = site

            # Prefix each extracted src on its own.  Joining all matches into
            # one string before prefixing fused several images into a single
            # unusable URL, and produced [site] (the bare home page) when the
            # article had no image at all.
            image_srcs = response.xpath(
                '//*[@id="WebPartWPQ3"]/table/tbody/tr[3]/td/table/tbody/tr[2]/td/div/p/img/@src'
            ).extract()
            item['image_urls'] = [
                site + src.strip() for src in image_srcs if src.strip()
            ]

            item['content'] = "".join(
                response.xpath(
                    '//*[@id="WebPartWPQ3"]/table/tbody/tr[3]/td/table/tbody/tr[2]/td/div/div'
                ).extract())
            item['title'] = "".join(
                response.xpath(
                    '//*[@id="WebPartWPQ3"]/table/tbody/tr[3]/td/table/tbody/tr[1]/td/text()'
                ).extract())
            # No publication date is extracted for this source.
            item['time'] = None
            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime())
            yield item
        except Exception as error:
            log(error)
Exemplo n.º 2
0
 def parse_news(self, response):
     """Yield a ``NewsItem`` describing the Malaysia article at ``response.url``.

     ``response`` must expose ``url`` and ``xpath()`` (presumably a Scrapy
     response -- confirm with the spider that registers this callback).
     Any exception raised while the item is assembled is passed to ``log``
     and nothing is yielded.
     """
     try:
         print("parse:" + response.url)

         def joined(query):
             # Concatenate every match of *query* into one string.
             return "".join(response.xpath(query).extract())

         item = NewsItem()
         item['url'] = response.url
         item['country_code'] = "6"
         item['country_name'] = "Malaysia"
         # This source never reports images.
         item['image_urls'] = None

         body_html = joined(
             '//*[@id="container_content"]/div[@class="editable"]')
         # Every src attribute in the body gets the site root prepended.
         item['content'] = body_html.replace(
             'src="', 'src="http://www.miti.gov.my/')

         item['source'] = "http://www.miti.gov.my"
         item['title'] = joined('//*[@id="365"]/div[2]/div[1]/h1/text()')
         item['time'] = joined(
             '//*[@id="container_content"]/div[3]/p/em/text()')
         item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime())
         yield item
     except Exception as error:
         log(error)
Exemplo n.º 3
0
    def parse_news(self, response):
        """Build a ``NewsItem`` for a Singapore article page and yield it.

        Args:
            response: Fetched page exposing ``url`` and ``xpath()``
                (looks like a Scrapy response -- confirm against the spider).

        Yields:
            The populated ``NewsItem``. If anything raises while it is being
            built, the exception is handed to ``log`` and nothing is yielded.
        """
        try:
            print("parse:" + response.url)
            item = NewsItem()
            item['url'] = response.url
            item['country_code'] = "9"
            item['country_name'] = "Singapore"

            # One list entry per matched src; joining the matches into a
            # single string first would glue several URLs together.
            image_srcs = response.xpath(
                '//*[@id="block-views-newsroom-page-block-1"]'
                '/div/div/div/div/div[2]/img/@src').extract()
            item['image_urls'] = [
                src.strip() for src in image_srcs if src.strip()
            ]

            # Search with '//div', not '/div': a leading single slash only
            # matches a <div> that is the document's root element, and the
            # root of an HTML document is <html>, so the absolute form never
            # selected anything and the content was always empty.
            item['content'] = "<br /><br />".join(
                response.xpath('//div[@class="row qna"]'
                               '/div/p/text()[name(..)!="img"]').extract())
            item['source'] = "http://www.pmo.gov.sg"
            item['title'] = "".join(
                response.xpath(
                    '//div[@class="breadcrumb"]'
                    '/span[@class="inline even last"]/a/text()').extract())
            item['time'] = "".join(
                response.xpath('//div[@class="col-sm-12 col-md-12"]'
                               '/div[@class="meta-table"]/text()').extract())
            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime())
            yield item
        except Exception as error:
            log(error)
    def parse_news(self, response):
        """Yield a ``NewsItem`` describing the Myanmar article at ``response.url``.

        ``response`` must expose ``url`` and ``xpath()`` (presumably a Scrapy
        response -- confirm with the spider that registers this callback).
        Any exception raised while the item is assembled is passed to ``log``
        and nothing is yielded.
        """
        try:
            print("parse:"+response.url)

            item = NewsItem()
            item['url'] = response.url
            item['country_code'] = "7"
            item['country_name'] = "Myanmar"
            item['source'] = "http://www.commerce.gov.mm"
            # This source never reports images.
            item['image_urls'] = None

            body_nodes = response.xpath(
                '//div[@class="field-item even"]').extract()
            # Every "img " occurrence in the joined markup is blanked out.
            item['content'] = str("".join(body_nodes).replace("img ", " "))

            title_parts = response.xpath(
                '//*[@id="page-title"]/text()').extract()
            item['title'] = "".join(title_parts)
            # No publication date is extracted for this source.
            item['time'] = None
            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime())
            yield item
        except Exception as error:
            log(error)
Exemplo n.º 5
0
    def parse_news(self, response):
        """Build a ``NewsItem`` for a Philippines article page and yield it.

        Args:
            response: Fetched page exposing ``url`` and ``xpath()``
                (looks like a Scrapy response -- confirm against the spider).

        Yields:
            The populated ``NewsItem``. If anything raises while it is being
            built, the exception is handed to ``log`` and nothing is yielded.
        """
        try:
            print("parse:"+response.url)

            site = "https://www.dfa.gov.ph"

            item = NewsItem()
            item['url'] = response.url
            item['country_code'] = "8"
            item['country_name'] = "Philippines"

            # Prefix each extracted src on its own.  Joining all matches into
            # one string before prefixing fused several images into a single
            # unusable URL, and produced [site] (the bare home page) when the
            # article had no image at all.
            image_srcs = response.xpath(
                '//div[@class="item-page"]/div[@itemprop="articleBody"]/p/img/@src'
            ).extract()
            item['image_urls'] = [
                site + src.strip() for src in image_srcs if src.strip()
            ]

            # Every src attribute inside the body gets the site root prepended.
            item['content'] = "<br /><br />".join(
                response.xpath(
                    '//div[@class="item-page"]/div[@itemprop="articleBody"]/p/span'
                ).extract()).replace('src="', 'src="https://www.dfa.gov.ph')
            item['source'] = site
            item['title'] = "".join(
                response.xpath(
                    '//*[@id="banner"]/div/div/header/h1/text()').extract())
            item['time'] = "".join(
                response.xpath(
                    '//*[@id="content"]/div[2]/div[2]/dl/dd[2]/time/text()'
                ).extract())
            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime())
            yield item
        except Exception as error:
            log(error)
Exemplo n.º 6
0
    def parse_news(self, response):
        """Yield a ``NewsItem`` describing the China article at ``response.url``.

        ``response`` must expose ``url`` and ``xpath()`` (presumably a Scrapy
        response -- confirm with the spider that registers this callback).
        Any exception raised while the item is assembled is passed to ``log``
        and nothing is yielded.
        """
        try:
            print("parse:" + response.url)

            item = NewsItem()
            item['url'] = response.url
            item['country_code'] = "1"
            item['country_name'] = "China"
            item['source'] = "http://www.soa.gov.cn"
            # This source never reports images.
            item['image_urls'] = None

            # Body: the non-blank <font> runs, separated by line breaks.
            paragraphs = response.xpath(
                '//div[@class="kuang_xiangqing"]/div/div[@class="TRS_Editor"]/p/font[name(..)!="img"][normalize-space()]'
            ).extract()
            item['content'] = "<br />".join(paragraphs)

            title_parts = response.xpath(
                '//div[@class="kuang_xiangqing"]/p[2]/text()').extract()
            item['title'] = "".join(title_parts)

            date_parts = response.xpath(
                '//div[@class="kuang_xiangqing"]/div[@class="subhead"]/text()'
            ).extract()
            item['time'] = "".join(date_parts)

            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime())
            yield item
        except Exception as error:
            log(error)
    def parse_news(self, response):
        """Yield a ``NewsItem`` describing the Vietnam article at ``response.url``.

        ``response`` must expose ``url`` and ``xpath()`` (presumably a Scrapy
        response -- confirm with the spider that registers this callback).
        Any exception raised while the item is assembled is passed to ``log``
        and nothing is yielded.
        """
        try:
            print("parse:"+response.url)

            def joined(query):
                # Concatenate every match of *query* into one string.
                return "".join(response.xpath(query).extract())

            item = NewsItem()
            item['url'] = response.url
            item['country_code'] = "11"
            item['country_name'] = "Vietnam"
            # This source never reports images.
            item['image_urls'] = None

            body_html = joined(
                '//*[@id="aspnetForm"]/div[3]/div[2]/div[2]/div[6]')
            # Every src attribute in the body gets the site root prepended.
            item['content'] = str(
                body_html.replace('src="', 'src="http://cn.news.chinhphu.vn'))

            item['source'] = "http://cn.news.chinhphu.vn"
            item['title'] = joined(
                '//*[@id="ctl00_mainContent_bodyContent_lbHeadline"]/text()')
            item['time'] = joined(
                '//*[@id="ctl00_mainContent_bodyContent_lbDate"]/text()')
            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime())
            yield item
        except Exception as error:
            log(error)
Exemplo n.º 8
0
    def parse_news(self, response):
        """Yield a ``NewsItem`` describing the Cambodia article at ``response.url``.

        ``response`` must expose ``url`` and ``xpath()`` (presumably a Scrapy
        response -- confirm with the spider that registers this callback).
        Any exception raised while the item is assembled is passed to ``log``
        and nothing is yielded.
        """
        try:
            print("parse:" + response.url)

            def joined(query):
                # Concatenate every match of *query* into one string.
                return "".join(response.xpath(query).extract())

            item = NewsItem()
            item['url'] = response.url
            item['country_code'] = "3"
            item['country_name'] = "Cambodia"
            item['source'] = "https://www.mfaic.gov.kh"
            # This source never reports images.
            item['image_urls'] = None
            # The whole detail container is kept as the body markup.
            item['content'] = joined('//div[@class="content-detail"]')
            item['title'] = joined(
                '//div[@class="content-detail"]/h4[@class="title-press"]/text()')
            item['time'] = joined(
                '//div[@class="content-detail"]/div[@class="date"]/ul/li/text()')
            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime())
            yield item
        except Exception as error:
            log(error)
Exemplo n.º 9
0
    def parse_news(self, response):
        """Yield a ``NewsItem`` describing the Thailand article at ``response.url``.

        ``response`` must expose ``url`` and ``xpath()`` (presumably a Scrapy
        response -- confirm with the spider that registers this callback).
        Any exception raised while the item is assembled is passed to ``log``
        and nothing is yielded.
        """
        try:
            print("parse:" + response.url)

            item = NewsItem()
            item['url'] = response.url
            item['country_code'] = "10"
            item['country_name'] = "Thailand"
            # This source never reports images.
            item['image_urls'] = None

            body_nodes = response.xpath(
                '//div[@class="border-normal clearfix"]').extract()
            item['content'] = "".join(body_nodes)

            item['source'] = "http://www.thaigov.go.th"

            title_parts = response.xpath(
                '//div[@class="col-xs-12 col-sm-6 padding-sm1 news-2"]/h3[@class="news-1 Circular color2"]/text()'
            ).extract()
            item['title'] = "".join(title_parts)

            date_parts = response.xpath(
                '//*[@id="banner-group"]/div/div/div/div[1]/div[2]/div[1]/p[1]/span[2]/text()'
            ).extract()
            item['time'] = "".join(date_parts)

            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime())
            yield item
        except Exception as error:
            log(error)
    def parse_news(self, response):
        """Yield a ``NewsItem`` for a worldmaritimenews.com article page.

        The country is read from ``response.meta["country"]``; when it equals
        one of the known names, the matching code and name are stored on the
        item, otherwise both country fields are left unset.

        ``response`` must expose ``url``, ``meta`` and ``xpath()`` (presumably
        a Scrapy response -- confirm with the spider that registers this
        callback). Any exception raised while the item is assembled is passed
        to ``log`` and nothing is yielded.
        """
        try:
            print("parse:" + response.url)
            country = response.meta["country"]

            item = NewsItem()
            item['url'] = response.url

            # (name, code) pairs, tested in this order with ``==``.
            known_countries = (
                ('China', "1"),
                ('Brunei', "2"),
                ('Cambodia', "3"),
                ('Indonesia', "4"),
                ('Lao', "5"),
                ('Malaysia', "6"),
                ('Myanmar', "7"),
                ('Philippine', "8"),
                ('Singapore', "9"),
                ('Thailand', "10"),
                ('Vietnam', "11"),
            )
            for name, code in known_countries:
                if country == name:
                    item['country_code'] = code
                    item['country_name'] = name
                    break

            item['source'] = "https://worldmaritimenews.com"

            # All thumbnail hrefs are joined, then split on whitespace.
            thumb_hrefs = response.xpath(
                '//*[@id="post_thumbnail"]/a/@href').extract()
            item['image_urls'] = "".join(thumb_hrefs).split()

            paragraphs = Selector(response=response).xpath(
                '//div[@class="block-row"]/article/div[@class="content"]/p'
            ).extract()
            item['content'] = "<br />".join(paragraphs)

            title_parts = response.xpath(
                '//div[@class="block-row"]/article/header/h1/text()').extract()
            item['title'] = "".join(title_parts)

            date_parts = response.xpath(
                '//div[@class="block-row"]/article/footer/p/text()').extract()
            item['time'] = "".join(date_parts)

            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime())
            yield item
        except Exception as error:
            log(error)
Exemplo n.º 11
0
    def parse_news(self, response):
        """Build a ``NewsItem`` for a seatrade-maritime.com article and yield it.

        The country is read from ``response.meta["country"]``; when it equals
        one of the known names, the matching code and name are stored on the
        item, otherwise both country fields are left unset.

        Args:
            response: Fetched page exposing ``url``, ``meta`` and ``xpath()``
                (looks like a Scrapy response -- confirm against the spider).

        Yields:
            The populated ``NewsItem``; ``image_urls`` is ``None`` when the
            page has no image link. If anything raises while the item is
            being built, the exception is handed to ``log`` and nothing is
            yielded.
        """
        try:
            print("parse:" + response.url)
            country = response.meta["country"]

            item = NewsItem()
            item['url'] = response.url

            # (name, code) pairs, tested in this order with ``==``.
            known_countries = (
                ('China', "1"),
                ('Brunei', "2"),
                ('Cambodia', "3"),
                ('Indonesia', "4"),
                ('Lao', "5"),
                ('Malaysia', "6"),
                ('Myanmar', "7"),
                ('Philippine', "8"),
                ('Singapore', "9"),
                ('Thailand', "10"),
                ('Vietnam', "11"),
            )
            for name, code in known_countries:
                if country == name:
                    item['country_code'] = code
                    item['country_name'] = name
                    break

            site = "http://www.seatrade-maritime.com"
            item['source'] = site

            item['content'] = "".join(
                response.xpath(
                    '//*[@id="k2Container"]/div[@class="itemBody"]/div[@class="contentBlock"]'
                ).extract())
            item['title'] = "".join(
                response.xpath(
                    '//*[@id="k2Container"]/div[@class="itemHeader"]/h1/text()'
                ).extract())
            item['time'] = "".join(
                response.xpath(
                    '//*[@id="k2Container"]/div[@class="itemBody"]/span/text()'
                ).extract())
            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime())

            # Decide "no image" from the extracted hrefs themselves.  The
            # previous check ran on the string that already carried the site
            # prefix, so it was never empty: the None fallback was
            # unreachable and image-less pages got [site] as their image.
            # Prefixing per href also keeps multiple images as separate URLs.
            image_hrefs = [
                href.strip() for href in response.xpath(
                    '//*[@id="k2Container"]/div[@class="itemBody"]/div[@class="itemImageBlock"]/span[@class="itemImage"]/a/@href'
                ).extract() if href.strip()
            ]
            if image_hrefs:
                item['image_urls'] = [site + href for href in image_hrefs]
            else:
                item['image_urls'] = None

            yield item
        except Exception as error:
            log(error)
Exemplo n.º 12
0
    def parse_news(self, response):
        """Return a ``NewsItem`` for an eworldship.com article page.

        The country is read from ``response.meta["country"]``; when it equals
        one of the known (Chinese) names, the matching code and English name
        are stored on the item, otherwise both country fields are left unset.

        ``response`` must expose ``url``, ``meta`` and ``xpath()`` (presumably
        a Scrapy response -- confirm with the spider that registers this
        callback). Any exception raised while the item is assembled is passed
        to ``log`` and the method then falls through, returning ``None``.
        """
        try:
            print("parse:" + response.url)
            country = response.meta["country"]

            item = NewsItem()
            item['url'] = response.url

            # (meta value, code, stored name) triples, tested in this order
            # with ``==``.
            known_countries = (
                ('中国', "1", "China"),
                ('文莱', "2", "Brunei"),
                ('柬埔寨', "3", "Cambodia"),
                ('印度尼西亚', "4", "Indonesia"),
                ('老挝', "5", "Lao"),
                ('马来西亚', "6", "Malaysia"),
                ('缅甸', "7", "Myanmar"),
                ('菲律宾', "8", "Philippine"),
                ('新加坡', "9", "Singapore"),
                ('泰国', "10", "Thailand"),
                ('越南', "11", "Vietnam"),
            )
            for label, code, name in known_countries:
                if country == label:
                    item['country_code'] = code
                    item['country_name'] = name
                    break

            item['source'] = "http://www.eworldship.com"

            body_nodes = response.xpath('//*[@id="nArticle"]/div[3]').extract()
            item['content'] = "".join(body_nodes)

            title_parts = response.xpath(
                '//*[@id="nArticle"]/h1/text()').extract()
            item['title'] = "".join(title_parts)

            date_parts = response.xpath(
                '//*[@id="artical_sth"]/p/span[1]/text()').extract()
            item['time'] = "".join(date_parts)

            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime())

            image_srcs = response.xpath(
                '//*[@id="nArticle"]/div[@class="content"]/p/img/@src'
            ).extract()
            # An empty match list is stored as None rather than [].
            item['image_urls'] = image_srcs if image_srcs else None

            return item
        except Exception as error:
            log(error)
Exemplo n.º 13
0
    def parse_news(self, response):
        """Yield a ``NewsItem`` for a marinelink.com article page.

        The country is read from ``response.meta["country"]``; when it equals
        one of the known names, the matching code and name are stored on the
        item, otherwise both country fields are left unset.

        ``response`` must expose ``url``, ``meta`` and ``xpath()`` (presumably
        a Scrapy response -- confirm with the spider that registers this
        callback). Any exception raised while the item is assembled is passed
        to ``log`` and nothing is yielded.
        """
        try:
            print("parse:" + response.url)
            country = response.meta["country"]

            def joined(query):
                # Concatenate every match of *query* into one string.
                return "".join(response.xpath(query).extract())

            item = NewsItem()
            item['url'] = response.url

            # (name, code) pairs, tested in this order with ``==``.
            known_countries = (
                ('China', "1"),
                ('Brunei', "2"),
                ('Cambodia', "3"),
                ('Indonesia', "4"),
                ('Lao', "5"),
                ('Malaysia', "6"),
                ('Myanmar', "7"),
                ('Philippine', "8"),
                ('Singapore', "9"),
                ('Thailand', "10"),
                ('Vietnam', "11"),
            )
            for name, code in known_countries:
                if country == name:
                    item['country_code'] = code
                    item['country_name'] = name
                    break

            item['source'] = "https://www.marinelink.com"

            image_hrefs = response.xpath(
                '//div[@class="innertube"]/article/div[@class="tmp8"]/a/@href'
            ).extract()
            # An empty match list is stored as None rather than [].
            item['image_urls'] = image_hrefs if image_hrefs else None

            item['content'] = joined(
                '//div[@class="innertube"]/article/div[@itemprop="text"]')
            item['title'] = joined(
                '//div[@class="innertube"]/article/h1/text()')
            item['time'] = joined(
                '//div[@class="innertube"]/article/p[@class="meta"]/span[@class="date"]/text()')
            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime())
            yield item
        except Exception as error:
            log(error)
    def parse_news(self, response):
        """Build a ``NewsItem`` from an ics-shipping.org article response.

        The country label is read from ``response.meta["country"]``.  When it
        equals one of the known labels, ``country_code`` and ``country_name``
        are filled in; otherwise both fields are left unset.

        Args:
            response: The downloaded page; must expose ``url``, ``meta`` and
                ``xpath``.

        Yields:
            The populated ``NewsItem``.  If anything raises while the item is
            being built, the exception is passed to ``log`` and nothing is
            yielded.
        """
        # (label, code) pairs, compared against the request's country label
        # in this order; the first equal label wins.
        known_countries = (
            ('China', "1"),
            ('Brunei', "2"),
            ('Cambodia', "3"),
            ('Indonesia', "4"),
            ('Lao', "5"),
            ('Malaysia', "6"),
            ('Myanmar', "7"),
            ('Philippine', "8"),
            ('Singapore', "9"),
            ('Thailand', "10"),
            ('Vietnam', "11"),
        )
        body_xpath = '//*[@id="MainContent_T2E4BE915010_Col00"]/div/div[2]'
        date_xpath = (
            '//*[@id="MainContent_T2E4BE915010_Col00"]/div/div[1]/text()')

        try:
            print("parse:" + response.url)
            country = response.meta["country"]
            print(country)

            item = NewsItem()
            item['url'] = response.url

            for label, code in known_countries:
                if country == label:
                    item['country_code'] = code
                    item['country_name'] = label
                    break

            item['source'] = "http://www.ics-shipping.org"

            body_fragments = response.xpath(body_xpath).extract()
            item['content'] = "".join(body_fragments)

            # The page <title> is used as the headline.
            title_fragments = response.xpath('//title/text()').extract()
            item['title'] = "".join(title_fragments)

            date_fragments = response.xpath(date_xpath).extract()
            item['time'] = "".join(date_fragments)

            crawl_moment = time.localtime(time.time())
            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 crawl_moment)
            item['image_urls'] = None

            yield item
        except Exception as error:
            log(error)
Exemplo n.º 15
0
    def parse_news(self, response):
        """Build a ``NewsItem`` from a worldcargonews.com article response.

        The country label is read from ``response.meta["country"]``.  When it
        equals one of the known labels, ``country_code`` and ``country_name``
        are filled in; otherwise both fields are left unset.

        The article HTML is taken from the ``ao-Article ao-CrIb`` container
        and a fixed list of ``<img>`` snippets (share icons plus one
        article-specific picture) is substituted out of it before it is stored
        in ``content``.

        Args:
            response: The downloaded page; must expose ``url``, ``meta`` and
                ``xpath``.

        Yields:
            The populated ``NewsItem``.  If anything raises while the item is
            being built, the exception is passed to ``log`` and nothing is
            yielded.
        """
        # (label, code) pairs, compared against the request's country label
        # in this order; the first equal label wins.
        known_countries = (
            ('China', "1"),
            ('Brunei', "2"),
            ('Cambodia', "3"),
            ('Indonesia', "4"),
            ('Lao', "5"),
            ('Malaysia', "6"),
            ('Myanmar', "7"),
            ('Philippine', "8"),
            ('Singapore', "9"),
            ('Thailand', "10"),
            ('Vietnam', "11"),
        )
        # (markup, replacement) pairs applied to the article HTML, in order.
        unwanted_markup = (
            ('<img src="/AcuCustom/Sitename/Icon/Icons/wcnIconLinkedIn.svg" alt="Linked In" title="Linked In" class="aos-DS5-Image aos-FL aos-MW100">',
             " "),
            ('<img src="/AcuCustom/Sitename/Icon/Icons/wcnIconTwitter.svg" alt="Twitter" title="Twitter" class="aos-DS5-Image aos-FL aos-MW100">',
             ' '),
            ('<img src="/AcuCustom/Sitename/Icon/Icons/wcnIconFB.svg" alt="Facebook" title="Facebook" class="aos-DS5-Image aos-FL aos-MW100">',
             ''),
            ('<img src="/AcuCustom/Sitename/Icon/Icons/wcnIconGoogleP.svg" alt="Google Plus" title="Google Plus" class="aos-DS5-Image aos-FL aos-MW100">',
             ''),
            ('<img src="/AcuCustom/Sitename/DAM/002/HHMC_STS_Delivered_to_Port_Louis.jpg" alt="HHMC STS cranes in Mauritius" title="HHMC STS cranes in Mauritius" class="aos-DS5-Image aos-W100">',
             ''),
        )

        try:
            print("parse:" + response.url)
            country = response.meta["country"]
            print(country)

            item = NewsItem()
            item['url'] = response.url

            for label, code in known_countries:
                if country == label:
                    item['country_code'] = code
                    item['country_name'] = label
                    break

            item['source'] = "https://www.worldcargonews.com"

            content = "".join(
                response.xpath('//div[@class="ao-Article ao-CrIb"]').extract())
            # str.replace returns a new string and leaves the receiver
            # untouched, so the result must be rebound; calling it as a bare
            # statement silently drops the substitution.
            for markup, replacement in unwanted_markup:
                content = content.replace(markup, replacement)
            item['content'] = content

            item['title'] = "".join(
                response.xpath(
                    '//div[@class="ao-Article ao-CrIb"]/h1/text()').extract())
            item['time'] = "".join(
                response.xpath(
                    '//span[@class="aos-ArticleDate aos-MRXS4 aos-NM aos-FL aos-DF"]/text()'
                ).extract())
            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 time.localtime(time.time()))

            item['image_urls'] = None

            yield item
        except Exception as error:
            log(error)
Exemplo n.º 16
0
    def parse_news(self, response):
        """Build a ``NewsItem`` from an imo.org page response.

        The country label is read from ``response.meta["country"]``.  When it
        equals one of the known labels, ``country_code`` and ``country_name``
        are filled in; otherwise both fields are left unset.

        The stored ``content`` is the HTML of the ``imo-pageLayout`` element
        with every occurrence of the text ``"img "`` replaced by a single
        space.  ``time`` and ``image_urls`` are always ``None``.

        Args:
            response: The downloaded page; must expose ``url``, ``meta`` and
                ``xpath``.

        Yields:
            The populated ``NewsItem``.  If anything raises while the item is
            being built, the exception is passed to ``log`` and nothing is
            yielded.
        """
        # (label, code) pairs, compared against the request's country label
        # in this order; the first equal label wins.
        known_countries = (
            ('China', "1"),
            ('Brunei', "2"),
            ('Cambodia', "3"),
            ('Indonesia', "4"),
            ('Lao', "5"),
            ('Malaysia', "6"),
            ('Myanmar', "7"),
            ('Philippine', "8"),
            ('Singapore', "9"),
            ('Thailand', "10"),
            ('Vietnam', "11"),
        )

        try:
            print("parse:" + response.url)
            country = response.meta["country"]

            item = NewsItem()
            item['url'] = response.url

            for label, code in known_countries:
                if country == label:
                    item['country_code'] = code
                    item['country_name'] = label
                    break

            item['source'] = "http://www.imo.org"
            item['image_urls'] = None

            layout_fragments = response.xpath(
                '//*[@id="imo-pageLayout"]').extract()
            layout_html = "".join(layout_fragments)
            item['content'] = str(layout_html.replace("img ", " "))

            heading_fragments = response.xpath(
                '//*[@id="imo-pageLayout"]/h1/text()').extract()
            item['title'] = "".join(heading_fragments)

            # No publication date is extracted for this source.
            item['time'] = None

            crawl_moment = time.localtime(time.time())
            item['crawled_time'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                 crawl_moment)
            yield item
        except Exception as error:
            log(error)