def parse_each_pages(self, response):
    """Crawl one nkis.kr board listing page and request every post's detail page.

    Reads ``page_no`` and ``last_page_no`` from ``response.meta``; the numeric
    cell in column 2 (presumably the running post number — confirm on site) is
    used to work out how many rows this page holds.
    """
    page_no = response.meta['page_no']
    last_page_no = response.meta['last_page_no']
    last = response.xpath('//*[@id="mainIndexTable"]/tbody/tr[2]/td[2]/nobr/text()').get()
    if page_no == last_page_no:
        category_last_no = int(last)
    else:
        first = response.xpath('//*[@id="mainIndexTable"]/tbody/tr[32]/td[2]/nobr/text()').get()
        category_last_no = int(last) - int(first) + 1
    for category_no in range(1, category_last_no + 1):
        item = CrawlnkdbItem()
        # Post rows sit at even <tr> indices (2, 4, 6, ...).
        row = '//*[@id="mainIndexTable"]/tbody/tr[' + str(2 * category_no) + ']'
        post_title = response.xpath(row + '/td[4]/a').xpath('string()').get()
        post_writer = response.xpath(row + '/td[6]').xpath('string()').get()
        post_date = response.xpath(row + '/td[7]').xpath('string()').get()
        item[config['VARS']['VAR1']] = post_title
        item[config['VARS']['VAR3']] = post_writer
        item[config['VARS']['VAR4']] = post_date
        print("###post_writer >>> ", post_writer)
        print("###post_date >>> ", post_date)
        category_link = response.xpath(row + '/td[4]/a/@href').get()
        url = 'http://www.nkis.kr/' + category_link
        print("###url >>> ", url)
        yield scrapy.Request(url, callback=self.parse_category, meta={'item': item})
def parse_each_pages(self, response):
    """Scrape one koreahana.or.kr listing page and request each row's attachment.

    ``last``/``first`` row numbers arrive via ``response.meta``; on the final
    page the count is re-read from the first row of the table itself.
    """
    page_no = response.meta['page_no']
    last_page_no = response.meta['last_page_no']
    last = response.meta['last']
    first = response.meta['first']
    table = '//*[@id="container"]/div[1]/div[2]/div[2]/div/div[4]/table/tbody/tr['
    if page_no == last_page_no:
        last = response.xpath(table + '1]/th/text()').get()
        category_last_no = int(last)
    else:
        category_last_no = int(last) - int(first) + 1
    for category_no in range(1, category_last_no + 1):
        item = CrawlnkdbItem()
        row = table + str(category_no) + ']'
        number = response.xpath(row + '/th').get()
        title = response.xpath(row + '/td[1]/p/text()').get()
        writer = "관리자"
        body = ""
        date = response.xpath(row + '/td[1]/span[1]').xpath('string()').get()
        top_category = response.xpath('//*[@id="container"]/div[1]/div[2]/div[2]/div/div[2]/ul/li[4]/a').xpath('string()').get()
        item[config['VARS']['VAR1']] = title
        item[config['VARS']['VAR3']] = writer
        item[config['VARS']['VAR2']] = body
        item[config['VARS']['VAR4']] = date
        item[config['VARS']['VAR5']] = "남북하나재단"
        item[config['VARS']['VAR6']] = "https://www.koreahana.or.kr/"
        item[config['VARS']['VAR7']] = top_category
        file_name = title
        onclick = response.xpath(row + '/td[2]/button/@onclick').get()
        # The download path is the first single-quoted argument of the inline
        # onclick handler.
        parts = onclick.split("'")
        file_download_url = 'https://www.koreahana.or.kr' + parts[1]
        print(file_download_url)
        item[config['VARS']['VAR10']] = file_download_url
        item[config['VARS']['VAR9']] = file_name
        yield scrapy.Request(file_download_url, callback=self.save_file, meta={'item': item})
def parse_each_pages(self, response):
    """Walk one kinu.or.kr listing page and request every post's detail page.

    The date is captured from the listing row and passed along in the item.
    """
    page_no = response.meta['page_no']
    last_page_no = response.meta['last_page_no']
    print("###pageno: ", page_no)
    last = response.xpath('//*[@id="cmsContent"]/div[2]/table/tbody/tr[1]/td[1]/text()').get()
    if page_no == last_page_no:
        category_last_no = int(last)
    else:
        first = response.xpath('//*[@id="cmsContent"]/div[2]/table/tbody/tr[10]/td[1]/text()').get()
        category_last_no = int(last) - int(first) + 1
    for category_no in range(1, category_last_no + 1):
        row = '//*[@id="cmsContent"]/div[2]/table/tbody/tr[' + str(category_no) + ']'
        category_link = response.xpath(row + '/td[2]/a/@href').get()
        url = 'http://www.kinu.or.kr/www/jsp/prg/api/' + category_link
        item = CrawlnkdbItem()
        # Capture the date shown in the listing before following the link.
        date = response.xpath(row + '/td[3]').xpath('string()').get()
        item[config['VARS']['VAR4']] = date
        yield scrapy.Request(url, callback=self.parse_post, meta={'item': item})
def parse_each_pages(self, response):
    """Crawl one nkorea.or.kr board page; posts occupy every other <tr> (odd rows)."""
    page_no = response.meta['page_no']
    last_page_no = response.meta['last_page_no']
    last = response.xpath('//*[@id="div_article_contents"]/tr[1]/td[1]/text()').get()
    if page_no == last_page_no:
        first = 1
    else:
        first = response.xpath('//*[@id="div_article_contents"]/tr[29]/td[1]/text()').get()
    category_last_no = int(last) - int(first) + 1
    for category_no in range(1, category_last_no + 1):
        row = '//*[@id="div_article_contents"]/tr[' + str(2 * category_no - 1) + ']'
        category_link = response.xpath(row + '/td[2]/font/a/@href').get()
        category_link = category_link.replace("./", "")
        url = "http://www.nkorea.or.kr/board/" + category_link
        # extract() deliberately keeps these as lists, matching what downstream
        # code received before.
        date = response.xpath(row + '/td[5]/text()').extract()
        writer = response.xpath(row + '/td[3]/text()').extract()
        item = CrawlnkdbItem()
        item["post_date"] = date
        item["post_writer"] = writer
        yield scrapy.Request(url, callback=self.parse_category, meta={'item': item})
def parse_post(self, response):
    """Parse a unibook.unikorea.go.kr detail page and fetch its attachment.

    Fix: the attachment href is checked for None *before* it is concatenated
    with the base URL. The original concatenated first, so a page without an
    attachment raised TypeError instead of reaching the ``yield item`` branch.
    """
    title = response.xpath('//*[@id="container"]/div/section/div[1]/div/table/tbody/tr[1]/td').xpath('string()').get()
    body = " "
    writer = response.xpath('//*[@id="container"]/div/section/div[1]/div/table/tbody/tr[4]/td').xpath('string()').get()
    date = response.xpath('//*[@id="container"]/div/section/div[1]/div/table/tbody/tr[6]/td').xpath('string()').get()
    top_category = response.xpath('//*[@id="container"]/div/nav/ul/li[2]/ul/li[5]/a').xpath('string()').get()
    print(top_category)
    item = CrawlnkdbItem()
    item[config['VARS']['VAR1']] = title
    item[config['VARS']['VAR2']] = body
    item[config['VARS']['VAR3']] = writer
    item[config['VARS']['VAR4']] = date
    item[config['VARS']['VAR7']] = top_category
    item[config['VARS']['VAR5']] = "통일부"
    item[config['VARS']['VAR6']] = "https://unibook.unikorea.go.kr/"
    file_name = response.xpath('//*[@id="container"]/div/section/div[1]/div/table/tbody/tr[3]/td/a').xpath('string()').get()
    href = response.xpath('//*[@id="container"]/div/section/div[1]/div/table/tbody/tr[3]/td/a/@href').get()
    if href:
        file_download_url = "https://unibook.unikorea.go.kr" + href
        print(file_download_url)
        item[config['VARS']['VAR10']] = file_download_url
        item[config['VARS']['VAR9']] = file_name
        print("@@@@@@file name ", file_name)
        if file_download_url.find("hwp") != -1:
            print('find hwp')
            yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item})
        # else:  (commented out in the original — the save_file request below
        # is always issued, even for hwp files; behavior preserved)
        yield scrapy.Request(file_download_url, callback=self.save_file, meta={'item': item})
    else:
        #print("###############file does not exist#################")
        yield item
def parse_post(self, response):
    """Parse a peacewomen.or.kr post into an item; body HTML is flattened to text.

    NOTE(review): this method yields the item unconditionally mid-way, and the
    else branch at the bottom yields it AGAIN when no file is found — combined
    with the dead ``file_icon = None`` assignment below, every post is emitted
    twice and no file is ever downloaded. Confirm whether that is intended.
    """
    item = CrawlnkdbItem()
    #title = response.css('#main > table > thead > tr > th font::text').get()
    title = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[1]/h1/a/span/text()').get()
    # Fallback: some posts have the title directly under <a>, not in a <span>.
    if title is None:
        title = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[1]/h1/a/text()').get()
    #table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
    # body = response.css('.descArea')[0].get_text()
    body = response.xpath('// *[ @ id = "s_mid21_wrap0"] / div / div[1] / div[2]').get()
    #body = re.search('<body.*/body>', body, re.I | re.S)
    # Strip scripts, then all remaining tags, then whitespace; double quotes are
    # converted to single quotes.
    body = re.sub('<script.*?>.*?</script>', '', body, 0, re.I | re.S)
    body = re.sub('<.+?>', '', body, 0, re.I | re.S)
    body = re.sub(' | |\t|\r|\n', " ", body)
    body = re.sub('\"', "'", body)
    print(body)
    #body = response.css('.descArea').xpath('string()').extract()
    date = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[1]/p[1]/text()').get()
    writer = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[1]/p[2]/a/text()').get()
    body_text = ''.join(body)
    # Category is fixed for this board.
    top_category = "군사주의와 여성"
    item['post_title'] = title.strip()
    item['post_date'] = date.strip()
    item['post_writer'] = writer.strip()
    item['post_body'] = body_text.strip()
    item['published_institution'] = "평화를 만드는 여성회"
    item['published_institution_url'] = "http://www.peacewomen.or.kr/"
    item[config['VARS']['VAR7']] = top_category
    yield item
    file_name = title
    file_icon = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[3]/div[1]/ul/li/a/text()').get()
    # NOTE(review): this overwrite makes the branch below unreachable — the
    # file download appears to be deliberately disabled; verify.
    file_icon = None
    if file_icon:
        file_download_url = response.xpath('//*[@id="s_mid21_wrap0"]/div/div[1]/div[3]/div[1]/ul/li/a/@href').extract()
        file_download_url = file_download_url[0]
        file_download_url = "http://www.peacewomen.or.kr/" + file_download_url
        item[config['VARS']['VAR10']] = file_download_url
        item[config['VARS']['VAR9']] = file_name
        print("@@@@@@file name ", file_name)
        if file_icon.find("hwp") != -1:
            print('find hwp')
            yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item}, dont_filter=True)
        # else:  (commented out in original; the save_file request is always issued)
        yield scrapy.Request(file_download_url, callback=self.save_file, meta={'item': item, 'file_download_url': file_download_url, 'file_name': file_icon}, dont_filter=True)
    else:
        print("###############file does not exist#################")
        yield item
def parse_each_pages(self, response):
    """Crawl one nkd.or.kr listing page and follow each linked post.

    Fix: the link None-check now uses ``category_link is None`` instead of the
    fragile ``str(type(...)) == "<class 'NoneType'>"`` comparison (identical
    behavior, idiomatic form).
    """
    page_no = response.meta['page_no']
    last_page_no = response.meta['last_page_no']
    last = response.xpath('//*[@id="contents"]/table/tbody/tr[1]/td[1]/text()').get()
    if page_no == last_page_no:
        first = 1
    else:
        first = response.xpath('//*[@id="contents"]/table/tbody/tr[20]/td[1]/text()').get()
    category_last_no = int(last) - int(first) + 1
    for category_no in range(1, category_last_no + 1):
        category_link = response.xpath('//*[@id="contents"]/table/tbody/tr[' + str(category_no) + ']/td[2]/a/@href').get()
        # Rows without a link are skipped.
        if category_link is None:
            continue
        url = "http://nkd.or.kr" + category_link
        item = CrawlnkdbItem()
        yield scrapy.Request(url, callback=self.parse_category, meta={'item': item})
def parse_each_pages(self, response):
    """Crawl one kolofo.org listing page, sending a desktop Chrome User-Agent."""
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    headers = {'User-Agent': user_agent}
    page_no = response.meta['page_no']
    last_page_no = response.meta['last_page_no']
    last = response.xpath('//*[@id="frm"]/div/table/tbody/tr[1]/td[1]/text()').get()
    if page_no == last_page_no:
        first = 1
    else:
        first = response.xpath('//*[@id="frm"]/div/table/tbody/tr[10]/td[1]/text()').get()
    category_last_no = int(last) - int(first) + 1
    for category_no in range(1, category_last_no + 1):
        category_link = response.xpath('//*[@id="frm"]/div/table/tbody/tr[' + str(category_no) + ']/td[2]/a/@href').get()
        url = "http://www.kolofo.org" + category_link
        item = CrawlnkdbItem()
        yield scrapy.Request(url, headers=headers, callback=self.parse_category, meta={'item': item})
def parse_each_pages(self, response):
    """Build a nuac.go.kr view URL for every row of one board page.

    NOTE(review): the final ``scrapy.Request`` is commented out in the
    original, so this callback currently yields nothing.
    """
    page_no = response.meta['page_no']
    last_page_no = response.meta['last_page_no']
    last = response.xpath('//*[@id="smain_all"]/table[2]/tbody/tr[1]/td[1]/div/font/text()').get()
    if page_no == last_page_no:
        category_last_no = int(last)
    else:
        first = response.xpath('//*[@id="smain_all"]/table[2]/tbody/tr[5]/td[1]/div/font/text()').get()
        category_last_no = int(last) - int(first) + 1
    for category_no in range(1, category_last_no + 1):
        onclick_xpath = '//*[@id="smain_all"]/table[2]/tbody/tr[' + str(category_no) + ']/td[3]/div/a/@onclick'
        onclick_text = response.xpath(onclick_xpath).extract()
        # The onclick handler embeds two numbers: ids[0] is used as bbs_idx,
        # ids[1] as the menu/board id.
        ids = re.findall(r"\d+", str(onclick_text))
        url = ('http://www.nuac.go.kr/actions/BbsDataAction?cmd=view&menuid=G' + ids[1]
               + '&bbs_id=G' + ids[1] + '&bbs_idx=' + ids[0]
               + '&parent_idx=&_template=03&_max=05&_page=' + str(page_no) + '&head=')
        item = CrawlnkdbItem()
        # yield scrapy.Request(url, callback=self.parse_post, meta={'item': item})
def parse_post(self, response):
    """Build an item from a pf.or.kr research post; attach file info when present."""
    item = CrawlnkdbItem()
    base = '//*[@id="mci_entrep"]/table/tbody/'
    title = response.xpath(base + 'tr[1]/td').xpath('string()').get()
    body = response.xpath(base + 'tr[5]').xpath('string()').get()
    writer = response.xpath(base + 'tr[2]/td[1]').xpath('string()').get()
    date = response.xpath(base + 'tr[2]/td[2]').xpath('string()').get()
    top_category = response.xpath('//*[@id="rep_tab_btn02"]/a').xpath('string()').get()
    item[config['VARS']['VAR1']] = title
    item[config['VARS']['VAR3']] = writer
    item[config['VARS']['VAR2']] = body
    item[config['VARS']['VAR4']] = date
    item[config['VARS']['VAR5']] = "평화재단"
    item[config['VARS']['VAR6']] = "http://www.pf.or.kr/wpages/01-3_research_1.php"
    item[config['VARS']['VAR7']] = top_category
    file_name = response.xpath(base + 'tr[4]/td/a').xpath('string()').get()
    print("###file_name: ", file_name)
    if file_name is not None:
        href = response.xpath(base + 'tr[4]/td/a/@href').get()
        file_download_url = "http://www.pf.or.kr" + href
        item[config['VARS']['VAR10']] = file_download_url
        item[config['VARS']['VAR9']] = file_name
        item[config['VARS']['VAR12']] = body
        print("@@@@@@file name ", file_name)
    yield item
def parse_each_pages(self, response):
    """Crawl one kolofo.org listing page using the spider-level headers."""
    page_no = response.meta['page_no']
    last_page_no = response.meta['last_page_no']
    last = response.xpath('//*[@id="frm"]/div/table/tbody/tr[1]/td[1]/text()').get()
    if page_no == last_page_no:
        first = 1
    else:
        first = response.xpath('//*[@id="frm"]/div/table/tbody/tr[10]/td[1]/text()').get()
    category_last_no = int(last) - int(first) + 1
    for category_no in range(1, category_last_no + 1):
        category_link = response.xpath('//*[@id="frm"]/div/table/tbody/tr[' + str(category_no) + ']/td[2]/a/@href').get()
        url = "http://www.kolofo.org" + category_link
        item = CrawlnkdbItem()
        yield scrapy.Request(url, headers=self.headers, callback=self.parse_category, meta={'item': item})
def parse_each_pages(self, response):
    """Scan one unibook.unikorea.go.kr listing page and request each entry's detail page.

    Meta inputs: ``link`` (logged only), ``page_no``, ``last_page_no`` and
    ``category_num`` (presumably entries per full page — confirm against caller).
    """
    link = response.meta['link']
    print("###link: ", link)
    page_no = response.meta['page_no']
    last_page_no = response.meta['last_page_no']
    category_num = response.meta['category_num']
    print("###pageno: ", page_no)
    if page_no == last_page_no:
        # The header <h5> contains the grand total, e.g. "1,234"; pull it out
        # and drop the thousands separator. NOTE(review): the regex "\d,\d+"
        # only matches totals that contain a comma — totals below 1,000 would
        # raise IndexError here; verify.
        page_total_num = response.xpath('//*[@id="container"]/div/section/div[1]/div[1]/header/h5').xpath('string()').get()
        page_total_num = re.findall("\d,\d+", str(page_total_num))
        page_total_num = str(page_total_num[0])
        page_total_num = page_total_num.replace(",", "")
        page_total_num = int(page_total_num)
        # NOTE(review): this computes pages*per_page - total (the overshoot),
        # not the number of entries on the last page; the usual remainder is
        # total - (last_page_no - 1) * category_num. Confirm intent.
        category_last_no = (last_page_no * category_num) - int(page_total_num)
        print(category_last_no)
    else:
        category_last_no = category_num
    category_no = 1
    while True:
        if (category_no > category_last_no):
            break
        title = response.xpath('//*[@id="sublist"]/ul[' + str(category_no) + ']/li[2]/h6/a').xpath('string()').get()
        print(title)
        body = " "
        writer = response.xpath('//*[@id="sublist"]/ul[' + str(category_no) + ']/li[2]/div/dl[1]/dd').xpath('string()').get()
        date = response.xpath('//*[@id="sublist"]/ul[' + str(category_no) + ']/li[2]/div/dl[3]/dd').xpath('string()').get()
        item = CrawlnkdbItem()
        item[config['VARS']['VAR1']] = title
        item[config['VARS']['VAR2']] = body
        item[config['VARS']['VAR3']] = writer
        item[config['VARS']['VAR8']] = date
        item[config['VARS']['VAR7']] = "통일부 발간물"
        item[config['VARS']['VAR5']] = "통일부"
        item[config['VARS']['VAR6']] = "https://unibook.unikorea.go.kr/"
        item[config['VARS']['VAR9']] = title
        crawl_url = response.xpath('//*[@id="sublist"]/ul[' + str(category_no) + ']/li[2]/h6/a/@href').get()
        url = "https://unibook.unikorea.go.kr/material/" + crawl_url
        category_no += 1
        print("#############category_url", url)
        yield scrapy.Request(url, callback=self.parse_post, meta={'item': item})
def parse_post(self, response):
    """Parse a uft.na.go.kr reference post into an item; fetch its attachment if any."""
    item = CrawlnkdbItem()
    title = response.xpath('//*[@id="jwxe_main_content"]/div/div/div[1]/div/p[1]/text()').get()
    body = response.xpath('//*[@id="jwxe_main_content"]/div/div/div[1]/div/div[1]/pre/text()').get()
    #print(body)
    # p[2] holds "writer | date": strip scripts, then all tags, collapse
    # whitespace, normalize quotes, and split on the pipe. The statement order
    # below matters — ``date`` is reused as the intermediate string/list.
    date = response.xpath('//*[@id="jwxe_main_content"]/div/div/div[1]/div/p[2]').get()
    date = re.sub('<script.*?>.*?</script>', '', date, 0, re.I | re.S)
    date = re.sub('<.+?>', '', date, 0, re.I | re.S)
    date = re.sub(' | |\t|\r|\n', " ", date)
    date = re.sub('\"', "'", date)
    date = date.split('|')
    writer = date[0]
    date = date[1]
    body_text = ''.join(body)
    top_category = response.xpath('//*[@id="sub_middle"]/div[2]/div[1]/div[1]/h3/text()').get()
    item[config['VARS']['VAR1']] = title.strip()
    item[config['VARS']['VAR4']] = date.strip()
    item[config['VARS']['VAR3']] = writer.strip()
    item[config['VARS']['VAR2']] = body_text.strip()
    item[config['VARS']['VAR5']] = "국회 외교통일위원회"
    item[config['VARS']['VAR6']] = "https://uft.na.go.kr"
    item[config['VARS']['VAR7']] = top_category
    file_name = title
    file_icon = response.xpath('//*[@id="jwxe_main_content"]/div/div/div[1]/div/div[2]/p[3]/a[1]/text()').get()
    if file_icon:
        file_download_url = response.xpath('//*[@id="jwxe_main_content"]/div/div/div[1]/div/div[2]/p[3]/a[1]/@href').extract()
        file_download_url = file_download_url[0]
        # NOTE(review): the href is appended directly to the .do page URL —
        # confirm the href is a query string, otherwise this URL is malformed.
        file_download_url = "https://uft.na.go.kr:444/uft/reference/reference03.do" + file_download_url
        #print(file_download_url)
        item[config['VARS']['VAR10']] = file_download_url
        item[config['VARS']['VAR9']] = file_name
        print("@@@@@@file name ", file_name)
        if file_icon.find("hwp") != -1:
            print('find hwp')
            yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item})
        # else:  (commented out in original — the save_file request below is
        # always issued, even for hwp files)
        yield scrapy.Request(file_download_url, callback=self.save_file, meta={'item': item, 'file_download_url': file_download_url, 'file_name': file_icon}, dont_filter=True)
    else:
        print("###############file does not exist#################")
        yield item
def parse_post(self, response):
    """Parse one row of the KINU 1940s–60s statistics table into an item.

    Date and writer are hard-coded for this collection; only title, body and
    category come from the page.
    """
    item = CrawlnkdbItem()
    category_no = response.meta['category_no']
    title = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[' + str(category_no) + ']/td[2]/text()').get()
    print(title)
    body = response.xpath('//*[@id="cmsContent"]/div[1]/div/div[2]/ul/li/text()').get()
    # Fixed metadata for this statistics collection.
    date = "1940-60"
    writer = "북한당국"
    body_text = ''.join(body)
    top_category = response.xpath('//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()
    item['post_title'] = title.strip()
    item['post_date'] = date.strip()
    item['post_writer'] = writer.strip()
    item['post_body'] = body_text.strip()
    item['published_institution'] = "통일연구원"
    item['published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
    item[config['VARS']['VAR7']] = top_category
    file_name = title
    # NOTE(review): the attachment xpaths always read tr[1], while the title
    # uses tr[category_no] — every row would get the first row's file; confirm
    # whether this is intended.
    file_icon = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/img').get()
    if file_icon:
        file_download_url = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/@href').extract()
        file_download_url = file_download_url[0]
        file_download_url = "http://www.kinu.or.kr/" + file_download_url
        item[config['VARS']['VAR10']] = file_download_url
        item[config['VARS']['VAR9']] = file_name
        print("@@@@@@file name ", file_name)
        if file_icon.find("hwp") != -1:
            print('find hwp')
            yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item})
        # else:  (commented out in original; the save_file request is always issued)
        yield scrapy.Request(file_download_url, callback=self.save_file, meta={'item': item, 'file_download_url': file_download_url, 'file_name': file_icon}, dont_filter=True)
    else:
        print("###############file does not exist#################")
        yield item
def parse_post(self, response):
    """Emit an item and download request for one unikorea.go.kr white-paper entry.

    The publication year is taken from the title prefix before '년'; most
    metadata is fixed for this collection.
    """
    item = CrawlnkdbItem()
    category_no = int(response.meta['category_no'])
    entry = '//*[@id="content"]/div[2]/ul/li[' + str(category_no) + ']'
    title = response.xpath(entry + '/div/div/h3/text()').get()
    print(title)
    body = "UNIFICATION WHITE PAPER"
    year_parts = title.split('년')
    date = str(year_parts[0])
    print(date)
    writer = "통일부"
    body_text = ''.join(body)
    top_category = "통일백서"
    item[config['VARS']['VAR1']] = title.strip()
    item[config['VARS']['VAR4']] = date.strip()
    item[config['VARS']['VAR3']] = writer.strip()
    item[config['VARS']['VAR2']] = body_text.strip()
    item[config['VARS']['VAR5']] = "통일부"
    item[config['VARS']['VAR6']] = "https://www.unikorea.go.kr"
    item[config['VARS']['VAR7']] = top_category
    file_name = title
    # No separate file icon exists on this board; the title stands in for it.
    file_icon = title
    if not file_icon:
        print("###############file does not exist#################")
        yield item
        return
    hrefs = response.xpath(entry + '/div/div/div/a[2]/@href').extract()
    file_download_url = "https://www.unikorea.go.kr" + hrefs[0]
    item[config['VARS']['VAR10']] = file_download_url
    item[config['VARS']['VAR9']] = file_name
    print("@@@@@@file name ", file_name)
    if file_icon.find("hwp") != -1:
        print('find hwp')
        yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item})
    # The save_file request is issued in every case (the else was disabled
    # in the original).
    yield scrapy.Request(file_download_url, callback=self.save_file, meta={'item': item, 'file_download_url': file_download_url, 'file_name': file_icon}, dont_filter=True)
def parse_post(self, response):
    """Parse one row of the KINU 2002 UN-submission statistics table into an item.

    Body, date and writer are hard-coded for this collection; only the title
    and category come from the page.
    """
    item = CrawlnkdbItem()
    category_no = response.meta['category_no']
    title = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[' + str(category_no) + ']/td[2]/text()').get()
    print(title)
    # Fixed description of this statistics collection (runtime value, not a comment).
    body = "2002년 북한당국은 UN 경제사회 이사회 등에 자국의 인권상황과 관련된 공식의견을 표명하면서 이를 뒷받침하기 위한 수단의 하나로 당시까지의 주요 공식 통계를 제출하였다. 이들 통계는 현재까지 얻을 수 있는 가장 최근의 북한 통계들로서 1990년대 이후 북한의 모습을 반영하고 있다. 이하에 수록된 통계 자료들은 이렇게 제출된 북한 통계들 가운데 중요한 것들을 취합한 것이다."
    date = "2002년"
    writer = "북한당국"
    body_text = ''.join(body)
    top_category = response.xpath('//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()
    item['post_title'] = title.strip()
    item['post_date'] = date.strip()
    item['post_writer'] = writer.strip()
    item['post_body'] = body_text.strip()
    item['published_institution'] = "통일연구원"
    item['published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
    item[config['VARS']['VAR7']] = top_category
    file_name = title
    # NOTE(review): attachment xpaths always read tr[1] while the title uses
    # tr[category_no]; every row would get the first row's file — confirm.
    file_icon = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/img').get()
    if file_icon:
        file_download_url = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/@href').extract()
        file_download_url = file_download_url[0]
        file_download_url = "http://www.kinu.or.kr/" + file_download_url
        item[config['VARS']['VAR10']] = file_download_url
        item[config['VARS']['VAR9']] = file_name
        print("@@@@@@file name ", file_name)
        if file_icon.find("hwp") != -1:
            print('find hwp')
            yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item})
        # else:  (commented out in original; the save_file request is always issued)
        yield scrapy.Request(file_download_url, callback=self.save_file, meta={'item': item, 'file_download_url': file_download_url, 'file_name': file_icon}, dont_filter=True)
    else:
        print("###############file does not exist#################")
        yield item
def parse_each_pages(self, response):
    """Walk one jpi.or.kr reports listing page; full pages carry 10 entries."""
    link = response.meta['link']
    print("###link: ", link)
    page_no = response.meta['page_no']
    print("###pageno: ", page_no)
    last_page_no = response.meta['last_page_no']
    last_page_category_num = response.meta['last_page_category_num']
    category_num = last_page_category_num if page_no == last_page_no else 10
    for category_no in range(1, category_num + 1):
        entry = '//*[@id="sub_reports"]/ul[' + str(category_no) + ']'
        title = response.xpath(entry + '/a/li').xpath('string()').get()
        print(title)
        # The list shows "By : <writer>" and "DATE : <date>" labels; strip them.
        writer = response.xpath(entry + '/li[3]').xpath('string()').get()
        writer = writer.replace("By : ", "").strip()
        date = response.xpath(entry + '/li[1]/span[1]').xpath('string()').get()
        date = date.replace("DATE : ", "").strip()
        item = CrawlnkdbItem()
        item[config['VARS']['VAR1']] = title
        item[config['VARS']['VAR3']] = writer
        item[config['VARS']['VAR4']] = date
        item[config['VARS']['VAR7']] = response.xpath('//*[@id="left_menu"]/li[4]/a').xpath('string()').get()
        item[config['VARS']['VAR5']] = "제주평화연구원"
        item[config['VARS']['VAR6']] = "http://www.jpi.or.kr/kor/issue/reports.sky"
        item[config['VARS']['VAR9']] = title
        crawl_url = response.xpath(entry + '/a/@href').get()
        url = "http://www.jpi.or.kr" + crawl_url
        print("#############category_url", url)
        yield scrapy.Request(url, callback=self.parse_post, meta={'item': item})
def parse_post(self, response): item = CrawlnkdbItem() # title = response.css('#main > table > thead > tr > th font::text').get() title = response.xpath('//*[@id="cmsContent"]/div[1]/p/text()').get() print(title) # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract() # body = response.css('.descArea')[0].get_text() body = "no text" # body = response.css('.descArea').xpath('string()').extract() date = response.xpath('//*[@id="cmsContent"]/div[2]/table/thead/tr[2]/td/text()').get() #print(date) writer = "KINU" #print(writer) body_text = ''.join(body) top_category = response.xpath('//*[@id="container"]/div[3]/div[1]/div/h2/text()').get() item['post_title'] = title.strip() item['post_date'] = date.strip() item['post_writer'] = writer.strip() item['post_body'] = body_text.strip() item['published_institution'] = "통일연구원" item['published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/" item[config['VARS']['VAR7']] = top_category file_name = title file_icon = response.xpath('//*[@id="cmsContent"]/div[2]/table/thead/tr[4]/td/a/img').get() if file_icon: file_download_url = response.xpath('//*[@id="cmsContent"]/div[2]/table/thead/tr[4]/td/a/@href').extract() file_download_url = file_download_url[0] item[config['VARS']['VAR10']] = file_download_url item[config['VARS']['VAR9']] = file_name print("@@@@@@file name ", file_name) if file_icon.find("hwp") != -1: print('find hwp') yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item}) # else: yield scrapy.Request(file_download_url, callback=self.save_file, meta={'item': item, 'file_download_url': file_download_url, 'file_name': file_icon}, dont_filter=True) else: print("###############file does not exist#################") yield item
def parse_each_pages(self, response):
    """Scrape one nuac.go.kr board page; each row's attachment is requested directly.

    Rows without an attachment link just yield their metadata item.
    """
    page_no = response.meta['page_no']
    last_page_no = response.meta['last_page_no']
    last = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr[1]/td[1]/div/font/text()').get()
    if page_no == last_page_no:
        category_last_no = int(last)
    else:
        first = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr[5]/td[1]/div/font/text()').get()
        category_last_no = int(last) - int(first) + 1
    for category_no in range(1, category_last_no + 1):
        item = CrawlnkdbItem()
        row = '//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr[' + str(category_no) + ']'
        title = response.xpath(row + '/td[3]/div').xpath('string()').get()
        body = " "
        writer = "관리자"
        date = response.xpath(row + '/td[6]/div/font').xpath('string()').get()
        top_category = response.xpath('//*[@id="main"]/div/div[1]/div[1]/a').xpath('string()').get()
        item[config['VARS']['VAR1']] = title
        item[config['VARS']['VAR2']] = body
        item[config['VARS']['VAR3']] = writer
        item[config['VARS']['VAR4']] = date
        item[config['VARS']['VAR5']] = "민주평화통일자문회의"
        item[config['VARS']['VAR6']] = "http://www.nuac.go.kr/actions/"
        item[config['VARS']['VAR7']] = top_category
        file_name = title
        file_download_url = response.xpath(row + '/td[5]/div/a/@href').get()
        if file_download_url is None:
            print("###############file does not exist#################")
            yield item
            continue
        item[config['VARS']['VAR10']] = file_download_url
        item[config['VARS']['VAR9']] = file_name
        if file_download_url.find("hwp") != -1:
            yield scrapy.Request(file_download_url, callback=self.save_file_hwp, meta={'item': item})
        # The save_file request is issued in every case (the else was disabled
        # in the original).
        yield scrapy.Request(file_download_url, callback=self.save_file, meta={'item': item})
def parse_each_pages(self, response):
    """Collect detail-page links from one nuac.go.kr listing page.

    NOTE(review): the request at the bottom is commented out in the original,
    so this callback currently produces nothing.
    """
    page_no = response.meta['page_no']
    last_page_no = response.meta['last_page_no']
    last = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr[1]/td[1]/div/font/text()').get()
    if page_no == last_page_no:
        category_last_no = int(last)
    else:
        first = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr[5]/td[1]/div/font/text()').get()
        category_last_no = int(last) - int(first) + 1
    for category_no in range(1, category_last_no + 1):
        url = response.xpath('//*[@id="main"]/div/div[2]/div/div[4]/table[2]/tbody/tr[' + str(category_no) + ']/td[3]/div/a/@href').get()
        item = CrawlnkdbItem()
        # yield scrapy.Request(url, callback=self.parse_post, meta={'item': item})
def parse_post(self, response):
    """Emit a CrawlnkdbItem for a KINU statistics-archive row (1960-90 data).

    Body, date, and writer are hard-coded for this fixed archive document;
    only the row title, the section heading, and the attachment come from
    the page. NOTE(review): this file re-defines ``parse_post`` several
    times — only the last definition survives at class-build time.
    """
    row_no = response.meta['category_no']
    title = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[' +
                           str(row_no) + ']/td[2]/text()').get()
    print(title)
    body = "1960년대 이후 1990년대 초까지 북한당국은 공식통계를 체계적으로 하지 않았다. 동 통계집은 이러한 북한통계의 공백기를 메우기 위해 한국 통일부가 북한의 각 문헌자료 속에 산재한 당시의 통계들을 하나로 모아 간행한 것이다. 여기에 수록된 통계들은 1990년대 이전까지의 북한과 관련된 거의 유일한 통계자료라고 할 수 있다."
    date = "1960-90"
    writer = "통일부"
    section_heading = response.xpath(
        '//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()
    item = CrawlnkdbItem()
    item['post_title'] = title.strip()
    item['post_date'] = date.strip()
    item['post_writer'] = writer.strip()
    item['post_body'] = ''.join(body).strip()
    item['published_institution'] = "통일연구원"
    item['published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
    item[config['VARS']['VAR7']] = section_heading
    icon_markup = response.xpath(
        '//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/img').get()
    if not icon_markup:
        print("###############file does not exist#################")
        yield item
        return
    download_url = "http://www.kinu.or.kr/" + response.xpath(
        '//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/@href'
    ).extract()[0]
    item[config['VARS']['VAR10']] = download_url
    item[config['VARS']['VAR9']] = title
    print("@@@@@@file name ", title)
    if "hwp" in icon_markup:
        print('find hwp')
        yield scrapy.Request(download_url,
                             callback=self.save_file_hwp,
                             meta={'item': item})
    # NOTE(review): the original comments out the else, so this generic
    # request is issued for every attachment, hwp included — confirm.
    yield scrapy.Request(download_url,
                         callback=self.save_file,
                         meta={
                             'item': item,
                             'file_download_url': download_url,
                             'file_name': icon_markup
                         },
                         dont_filter=True)
def parse_post(self, response):
    """Parse a unikorea.go.kr (Ministry of Unification) post into a
    CrawlnkdbItem and request its first attachment.

    The attachment link is a ``javascript:Jnit_boardDownload('path', ...)``
    call; the real download path is its first quoted argument.

    FIX: the original bound the parsed pieces to a local named ``slice``,
    shadowing the builtin — renamed. Logic is otherwise unchanged.
    """
    item = CrawlnkdbItem()
    # NOTE(review): category_no is converted but never used afterwards; it is
    # kept only for the implicit KeyError check on response.meta — confirm it
    # can be dropped.
    category_no = response.meta['category_no']
    category_no = int(category_no)
    title = response.xpath(
        '//*[@id="bbsForm"]/div/article/div[1]/h3/text()').get()
    body = response.xpath(
        '//*[@id="bbsForm"]/div/article/div[2]/div[1]/p/text()').get()
    date = response.xpath(
        '//*[@id="bbsForm"]/div/article/div[1]/div/dl[2]/dd/text()').get()
    writer = "통일부"
    body_text = ''.join(body)
    top_category = "북한동향"
    item[config['VARS']['VAR1']] = title.strip()
    item[config['VARS']['VAR4']] = date.strip()
    item[config['VARS']['VAR3']] = writer.strip()
    item[config['VARS']['VAR2']] = body_text.strip()
    item[config['VARS']['VAR5']] = "통일부"
    item[config['VARS']['VAR6']] = "https://www.unikorea.go.kr"
    item[config['VARS']['VAR7']] = top_category
    file_name = title
    file_icon = response.xpath(
        '//*[@id="bbsForm"]/div/article/div[2]/section/div[2]/ul/li[1]/a[1]/text()'
    ).get()
    # file_icon = None
    if file_icon:
        raw_href = response.xpath(
            '//*[@id="bbsForm"]/div/article/div[2]/section/div[2]/ul/li[1]/a[1]/@href'
        ).extract()[0]
        # Pull the first quoted argument out of
        # javascript:Jnit_boardDownload('...', ...);
        call_args = raw_href.split("javascript:Jnit_boardDownload(")[1]
        call_args = call_args.split(";")[0].split("'")
        file_download_url = "https://www.unikorea.go.kr/" + call_args[1]
        item[config['VARS']['VAR10']] = file_download_url
        item[config['VARS']['VAR9']] = file_name
        print("@@@@@@file name ", file_name)
        if file_icon.find("hwp") != -1:
            print('find hwp')
            yield scrapy.Request(file_download_url,
                                 callback=self.save_file_hwp,
                                 meta={'item': item},
                                 dont_filter=True)
        # NOTE(review): original else is commented out — every attachment also
        # goes to the generic handler (hence dont_filter=True). Confirm.
        yield scrapy.Request(file_download_url,
                             callback=self.save_file,
                             meta={
                                 'item': item,
                                 'file_download_url': file_download_url,
                                 'file_name': file_icon
                             },
                             dont_filter=True)
    else:
        print("###############file does not exist#################")
        yield item
def parse_post(self, response):
    """Emit a CrawlnkdbItem for the KINU archive row carrying North Korea's
    1998 UNDP round-table statistics.

    Body, date, and writer are hard-coded for this fixed document; the row
    title, section heading, and attachment link come from the page.
    """
    row_no = response.meta['category_no']
    title = response.xpath('//*[@id="cmsContent"]/div[3]/table/tbody/tr[' +
                           str(row_no) + ']/td[2]/text()').get()
    print(title)
    body = "1998년 북한은 국제적 식량 및 농업지원을 목적으로 UNDP와 공동으로 \"Thematic Round Table Meeting on Agricultural Recovery and Environmental Protection For the DPRK\"를 개최하였다. 동 회의를 위해 북한은 사상 최초로 자국의 美달러화 표시 GDP 규모를 밝히는 등 여러 중요한 통계자료를 제출하였다. 이하에 수록된 통계 자료들은 이렇게 제출된 북한의 공식통계를 취합한 것이다."
    date = "1998"
    writer = "북한당국"
    section_heading = response.xpath(
        '//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()
    item = CrawlnkdbItem()
    item['post_title'] = title.strip()
    item['post_date'] = date.strip()
    item['post_writer'] = writer.strip()
    item['post_body'] = ''.join(body).strip()
    item['published_institution'] = "통일연구원"
    item['published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
    item[config['VARS']['VAR7']] = section_heading
    icon_markup = response.xpath(
        '//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/img').get()
    if icon_markup:
        link = response.xpath(
            '//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]/@href'
        ).extract()[0]
        download_url = "http://www.kinu.or.kr/" + link
        item[config['VARS']['VAR10']] = download_url
        item[config['VARS']['VAR9']] = title
        print("@@@@@@file name ", title)
        if icon_markup.find("hwp") != -1:
            print('find hwp')
            yield scrapy.Request(download_url,
                                 callback=self.save_file_hwp,
                                 meta={'item': item})
        # NOTE(review): the else of the original is commented out, so the
        # request below always follows — confirm.
        yield scrapy.Request(download_url,
                             callback=self.save_file,
                             meta={
                                 'item': item,
                                 'file_download_url': download_url,
                                 'file_name': icon_markup
                             },
                             dont_filter=True)
    else:
        print("###############file does not exist#################")
        yield item
def parse_post(self, response):
    """Emit a CrawlnkdbItem for the KINU archive row on the post-1997
    child-nutrition surveys conducted with international organizations.

    The body text, date, and writer are fixed for this document; the row
    title, section heading, and attachment are scraped from the page.
    """
    row_index = response.meta['category_no']
    row_cell = ('//*[@id="cmsContent"]/div[3]/table/tbody/tr[' +
                str(row_index) + ']/td[2]/text()')
    title = response.xpath(row_cell).get()
    print(title)
    body = "1997년 이후 북한당국은 국제기구 등이 중심이 되어 실시한 북한 어린이 영양실태 조사에 적극적으로 협조해 왓다. 동 조사의 결과는 조사에 참여한 국제기구 등에 의해 발표되었을 뿐만 아니라 동시에 북한 조선중앙통계국의 명의로도 발표되었다. 이하에 수록된 통계 자료들은 이렇게 발표된 조사결과를 취합한 것이다."
    date = "1997년 이후"
    writer = "북한당국"
    heading = response.xpath(
        '//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()
    item = CrawlnkdbItem()
    item['post_title'] = title.strip()
    item['post_date'] = date.strip()
    item['post_writer'] = writer.strip()
    item['post_body'] = ''.join(body).strip()
    item['published_institution'] = "통일연구원"
    item['published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
    item[config['VARS']['VAR7']] = heading
    anchor = '//*[@id="cmsContent"]/div[3]/table/tbody/tr[1]/td[3]/a[2]'
    icon_html = response.xpath(anchor + '/img').get()
    if not icon_html:
        print("###############file does not exist#################")
        yield item
        return
    download_url = ("http://www.kinu.or.kr/" +
                    response.xpath(anchor + '/@href').extract()[0])
    item[config['VARS']['VAR10']] = download_url
    item[config['VARS']['VAR9']] = title
    print("@@@@@@file name ", title)
    if "hwp" in icon_html:
        print('find hwp')
        yield scrapy.Request(download_url,
                             callback=self.save_file_hwp,
                             meta={'item': item})
    # NOTE(review): original's else is commented out, so the generic request
    # below always runs — confirm.
    yield scrapy.Request(download_url,
                         callback=self.save_file,
                         meta={
                             'item': item,
                             'file_download_url': download_url,
                             'file_name': icon_html
                         },
                         dont_filter=True)
def parse_post(self, response):
    """Turn a uniedu.go.kr (통일교육원) post page into a CrawlnkdbItem.

    Falls back to the two <dl> blocks when the main body div is missing.
    NOTE(review): ``file_icon`` is unconditionally forced to ``None`` below,
    so the attachment branch is dead and every post is yielded without its
    file — presumably a deliberate switch; confirm.
    """
    def strip_html(markup):
        # Same pipeline as the original: drop scripts, then all tags,
        # collapse whitespace entities, and swap double quotes for singles.
        markup = re.sub('<script.*?>.*?</script>', '', markup, 0, re.I | re.S)
        markup = re.sub('<.+?>', '', markup, 0, re.I | re.S)
        markup = re.sub(' | |\t|\r|\n', " ", markup)
        return re.sub('\"', "'", markup)

    item = CrawlnkdbItem()
    title = response.xpath(
        '//*[@id="content_section"]/div[2]/div[1]/h4/text()').get()
    body = response.xpath('//*[@id="content_section"]/div[2]/div[2]').get()
    if body is None:
        part_one = strip_html(
            response.xpath('//*[@id="content_section"]/div[2]/dl[1]').get())
        part_two = strip_html(
            response.xpath('//*[@id="content_section"]/div[2]/dl[2]').get())
        body = part_one + part_two
    else:
        body = strip_html(body)
    if body is None:
        body = "No text"
    if body == '':
        body = "No text"
    date = response.xpath(
        '//*[@id="content_section"]/div[2]/div/div[2]/p[1]/span/text()'
    ).get()
    if date is None:
        date = "No date"
    writer = response.xpath(
        '//*[@id="content_section"]/div[2]/div/div[2]/p[2]/span/text()'
    ).get()
    if writer is None:
        writer = "No writer"
    item[config['VARS']['VAR1']] = title.strip()
    item[config['VARS']['VAR4']] = date.strip()
    item[config['VARS']['VAR3']] = writer.strip()
    item[config['VARS']['VAR2']] = ''.join(body).strip()
    item[config['VARS']['VAR5']] = "통일부"
    item[config['VARS']['VAR6']] = "https://www.uniedu.go.kr/"
    item[config['VARS']['VAR7']] = "도서/동영상자료"
    file_name = title
    file_icon = response.xpath(
        '//*[@id="content_section"]/div[2]/div/div[3]/p[1]/a/text()').get()
    if not file_icon:
        file_icon = response.xpath(
            '//*[@id="content_section"]/div[2]/div[1]/div[2]/p/a/text()'
        ).get()
    print(file_icon)
    file_icon = None  # attachment handling disabled; branch below is dead
    if file_icon:
        file_download_url = response.xpath(
            '//*[@id="content_section"]/div[2]/div/div[3]/p[1]/a/@href'
        ).extract()
        # NOTE(review): extract() never returns None (it returns a list), so
        # this fallback can't trigger; dead code kept as-is — confirm.
        if file_download_url is None:
            file_download_url = response.xpath(
                '//*[@id="content_section"]/div[2]/div[1]/div[2]/p/a/@href'
            ).extract()
        file_download_url = file_download_url[0]
        file_download_url = "https://www.uniedu.go.kr" + file_download_url
        item[config['VARS']['VAR10']] = file_download_url
        item[config['VARS']['VAR9']] = file_name
        print("@@@@@@file name ", file_name)
        if file_icon.find("hwp") != -1:
            print('find hwp')
            yield scrapy.Request(file_download_url,
                                 callback=self.save_file_hwp,
                                 meta={'item': item})
        # else:
        yield scrapy.Request(file_download_url,
                             callback=self.save_file,
                             meta={
                                 'item': item,
                                 'file_download_url': file_download_url,
                                 'file_name': file_icon
                             },
                             dont_filter=True)
    else:
        print("###############file does not exist#################")
        yield item
def parse_post(self, response):
    """Parse one post page of the Dongguk University NK Studies board
    (kboard) into a CrawlnkdbItem.

    NOTE(review): ``file_icon`` is unconditionally reset to ``None`` below,
    so the attachment branch is dead code and every post is yielded without
    requesting its file — presumably a deliberate switch; confirm.
    """
    item = CrawlnkdbItem()
    # title = response.css('#main > table > thead > tr > th font::text').get()
    title = response.xpath(
        '//*[@id="kboard-default-document"]/div[2]/div[1]/p/text()').get()
    # table_text = response.css('#main > table > tbody > tr.boardview2 td::text').extract()
    # body = response.css('.descArea')[0].get_text()
    body = response.xpath(
        '//*[@id="kboard-default-document"]/div[2]/div[3]/div/text()').get(
        )
    # Substitute a placeholder when the post has no body text.
    if body is None:
        body = "No text"
    if body == '':
        body = "No text"
    # body = response.css('.descArea').xpath('string()').extract()
    date = response.xpath(
        '//*[@id="kboard-default-document"]/div[2]/div[2]/div[2]/div[2]/text()'
    ).get()
    writer = response.xpath(
        '//*[@id="kboard-default-document"]/div[2]/div[2]/div[1]/div[2]/text()'
    ).get()
    # ''.join on a str is a no-op; body is always a str at this point.
    body_text = ''.join(body)
    top_category = response.xpath(
        '//*[@id="main"]/header/div/h1/text()').get()
    item[config['VARS']['VAR1']] = title.strip()
    item[config['VARS']['VAR4']] = date.strip()
    item[config['VARS']['VAR3']] = writer.strip()
    item[config['VARS']['VAR2']] = body_text.strip()
    item[config['VARS']['VAR5']] = "동국대학교 북한학연구소"
    item[config['VARS']['VAR6']] = "https://nkstudy.dongguk.edu"
    item[config['VARS']['VAR7']] = top_category
    file_name = title
    file_icon = response.xpath(
        '//*[@id="kboard-default-document"]/div[2]/div[4]/a/text()').get()
    # Attachment handling disabled: this overwrite makes the branch below dead.
    file_icon = None
    if file_icon:
        # NOTE(review): the leading space inside the XPath literal below looks
        # unintended (though XPath tolerates surrounding whitespace) — confirm.
        file_download_url = response.xpath(
            ' //*[@id="kboard-default-document"]/div[2]/div[4]/a/@href'
        ).extract()
        file_download_url = file_download_url[0]
        item[config['VARS']['VAR10']] = file_download_url
        item[config['VARS']['VAR9']] = file_name
        print("@@@@@@file name ", file_name)
        # ".hwp" attachments get the hwp-specific handler first.
        if file_icon.find("hwp") != -1:
            print('find hwp')
            yield scrapy.Request(file_download_url,
                                 callback=self.save_file_hwp,
                                 meta={'item': item},
                                 dont_filter=True)
        # else:
        yield scrapy.Request(file_download_url,
                             callback=self.save_file,
                             meta={
                                 'item': item,
                                 'file_download_url': file_download_url,
                                 'file_name': file_icon
                             },
                             dont_filter=True)
    else:
        print("###############file does not exist#################")
        yield item
def parse_post(self, response):
    """Parse a KINU post whose body lives in the ``#tab_con`` pane.

    The stored body is the tag-stripped pane HTML plus the attachment URL.
    Attachment downloading itself is deliberately disabled (``file_icon`` is
    forced to None), so every post is yielded as a bare item.

    FIX: the original did ``extract()[0]`` on the attachment href
    unconditionally, raising IndexError on posts without an attachment link;
    now falls back to an empty URL string in that case.
    """
    item = CrawlnkdbItem()
    title = response.xpath('//*[@id="cmsContent"]/div[1]/p/text()').get()
    body = response.xpath('//*[@id="tab_con"]').get()
    if body is None:
        body = "no text"
    # Strip scripts and tags, collapse whitespace entities, normalize quotes.
    body = re.sub('<script.*?>.*?</script>', '', body, 0, re.I | re.S)
    body = re.sub('<.+?>', '', body, 0, re.I | re.S)
    body = re.sub(' | |\t|\r|\n', " ", body)
    body = re.sub('\"', "'", body)
    hrefs = response.xpath(
        '//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[5]/td/span/a/@href'
    ).extract()
    file_download_url = hrefs[0] if hrefs else ""
    body = "본문 : " + body + " URL : " + file_download_url
    print(body)
    date = response.xpath(
        '//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[2]/td[1]/text()'
    ).get()
    writer = response.xpath(
        '//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[1]/td/text()'
    ).get()
    body_text = ''.join(body)
    top_category = response.xpath(
        '//*[@id="container"]/div[3]/div[1]/div/h2/text()').get()
    item['post_title'] = title.strip()
    item['post_date'] = date.strip()
    item['post_writer'] = writer.strip()
    item['post_body'] = body_text.strip()
    item['published_institution'] = "통일연구원"
    item['published_institution_url'] = "http://www.kinu.or.kr/www/jsp/prg/"
    item[config['VARS']['VAR7']] = top_category
    file_name = title
    file_icon = response.xpath(
        '//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[5]/td/span'
    ).get()
    file_icon = None  # block attachments (was: "첨부파일 막기"); branch below is dead
    if file_icon:
        file_download_url = response.xpath(
            '//*[@id="cmsContent"]/div[2]/div[2]/table/tbody/tr[5]/td/span/a/@href'
        ).extract()
        file_download_url = file_download_url[0]
        item[config['VARS']['VAR10']] = file_download_url
        item[config['VARS']['VAR9']] = file_name
        print("@@@@@@file name ", file_name)
        if file_icon.find("hwp") != -1:
            print('find hwp')
            yield scrapy.Request(file_download_url,
                                 callback=self.save_file_hwp,
                                 meta={'item': item})
        # else:
        yield scrapy.Request(file_download_url,
                             callback=self.save_file,
                             meta={
                                 'item': item,
                                 'file_download_url': file_download_url,
                                 'file_name': file_icon
                             },
                             dont_filter=True)
    else:
        print("###############file does not exist#################")
        yield item
def parse_post(self, response):
    """Convert a spark946.org data-board post into a CrawlnkdbItem and
    request its attachment when one is linked."""
    article = '//*[@id="subConts"]/section/article'
    item = CrawlnkdbItem()
    title = response.xpath(article + '/header/h1/text()').get()
    body = response.xpath(article + '/section').get()
    # Cleanup pipeline identical to the original: scripts, tags, whitespace
    # entities, then double quotes to single quotes.
    for pattern, replacement, flags in (
            ('<script.*?>.*?</script>', '', re.I | re.S),
            ('<.+?>', '', re.I | re.S),
            (' | |\t|\r|\n', " ", 0),
            ('\"', "'", 0)):
        body = re.sub(pattern, replacement, body, 0, flags)
    date = response.xpath(article + '/header/address/p[2]/time/text()').get()
    writer = response.xpath(article + '/header/address/p[1]/text()').get()
    top_category = response.xpath('//*[@id="subConts"]/h1/text()').get()
    item['post_title'] = title.strip()
    item['post_date'] = date.strip()
    item['post_writer'] = writer.strip()
    item['post_body'] = ''.join(body).strip()
    item['published_institution'] = "평화와 통일을 여는 사람들"
    item['published_institution_url'] = "http://www.spark946.org/data/"
    item[config['VARS']['VAR7']] = top_category
    attachment_label = response.xpath(
        article + '/ul[1]/li/a/strong/text()').get()
    if not attachment_label:
        print("###############file does not exist#################")
        yield item
        return
    download_url = "http://www.spark946.org/" + response.xpath(
        article + '/ul[1]/li/a/@href').extract()[0]
    item[config['VARS']['VAR10']] = download_url
    item[config['VARS']['VAR9']] = title
    print("@@@@@@file name ", title)
    if "hwp" in attachment_label:
        print('find hwp')
        yield scrapy.Request(download_url,
                             callback=self.save_file_hwp,
                             meta={'item': item})
    # NOTE(review): the original's else is commented out, so this request
    # always follows the hwp one — confirm.
    yield scrapy.Request(download_url,
                         callback=self.save_file,
                         meta={
                             'item': item,
                             'file_download_url': download_url,
                             'file_name': attachment_label
                         },
                         dont_filter=True)