def parse(self, response):
    """Parse a document detail page guarded by a "docdate" meta element.

    Pulls the title plus index/issue metadata from a fixed-layout table,
    extracts body text and attachments, and yields one filled item.
    Pages without the "docdate" element are skipped entirely.
    """
    # Guard clause: bail out early when the marker element is absent.
    if not response.xpath('//*[@name="docdate"]'):
        return
    title = response.xpath('//title/text()').extract_first().strip()
    indexNumber = response.xpath(
        '//*[@id="ess_ctr445_ModuleContent"]/table/tr[1]/td[2]/text()'
    ).extract_first()
    print('title', title)
    print('index', indexNumber)
    IssuedNumber = response.xpath(
        '//*[@id="ess_ctr445_ModuleContent"]/table/tr[3]/td[2]/text()[2]'
    ).extract_first()
    print('Issued', IssuedNumber)
    publishDate = response.xpath(
        '//*[@id="ess_ctr445_ModuleContent"]/table/tr[4]/td[2]/text()'
    ).extract_first()
    print('publish', publishDate)
    IssuedOrgan = response.xpath(
        '//*[@id="ess_ctr445_ModuleContent"]/table/tr[5]/td[4]/text()'
    ).extract_first()
    print('Issued', IssuedOrgan)
    editor = textEdit()
    text, files = editor.dealWithAll(response, id='ess_ctr445_ModuleContent')
    print('text', text)
    print('files', files)
    yield four.items.fillinData(title, '', '', '', '', '', indexNumber, '',
                                IssuedOrgan, '', IssuedNumber, '', '', text,
                                files, publishDate, '')
def parse(self, response):
    """Parse a detail page: title, index number, issuing organ and publish
    date from the header table, plus body text/attachments; yield one item.
    """
    title = response.xpath('//title/text()').extract_first()
    print(title)
    indexNumber = response.xpath(
        '//*[@id="headContainer"]/tbody/tr[1]/td/table/tr/td[1]/text()'
    ).extract_first()
    IssuingOrgan = response.xpath(
        '//*[@id="headContainer"]/tbody/tr[3]/td/table/tr/td[1]/span/text()'
    ).extract_first()
    publishDate = response.xpath(
        '//*[@id="headContainer"]/tbody/tr[3]/td/table/tr/td[2]/span/text()'
    ).extract_first()
    # Normalise the date: every non-digit becomes "-", then the trailing
    # separator is dropped (assumes the raw date ends in a non-digit --
    # e.g. a CJK date suffix -- TODO confirm against live pages).
    # FIX: raw string -- '\D' is an invalid escape (SyntaxWarning on 3.12+).
    publishDate = re.sub(r'\D', '-', publishDate)
    publishDate = publishDate[:-1]
    print('indexNumber:%s' % indexNumber)
    print('IssuingOrgan:%s' % IssuingOrgan)
    print('publishDate:%s' % publishDate)
    te = textEdit()
    text, files = te.dealWithAll(response, classname="content")
    print('text:%s' % text)
    print('files:%s' % files)
    item_fortyfive = four.items.fillinData(title, '', '', '', '', '',
                                           indexNumber, '', IssuingOrgan, '',
                                           '', '', '', text, files,
                                           publishDate, '')
    yield item_fortyfive
def parse(self, response):
    """Print the page title, publish date and extracted body/attachments.

    NOTE(review): unlike sibling parsers, this one never yields an item --
    it only prints; confirm that is intentional.
    """
    title = response.xpath('//*[@name="title"]/@content').extract_first()
    print(title)
    raw_date = response.xpath(
        '//*[@class="sp_time"]/font/text()').extract_first()
    # Keep the text after the first ":" separator.
    publishDate = raw_date.split(':')[1]
    print(publishDate)
    editor = textEdit()
    text, files = editor.dealWithAll(response, id="zoom")
    print('text:%s' % text)
    print('file:%s' % files)
def parse(self, response):
    """Parse a detail page: flattened heading, publish date from the
    byline, body text/attachments; yield one filled item.
    """
    # string(.) flattens the <h1> so nested inline markup is included.
    title = response.xpath('//h1').xpath('string(.)').extract_first()
    merge = response.xpath('//*[@class="lyd"]/text()').extract_first()
    # FIX: raw string -- '\d' is an invalid escape (SyntaxWarning on 3.12+).
    publishDate = re.findall(r'\d{4}-\d{2}-\d{2}', merge)[0]
    te = textEdit()
    text, files = te.dealWithAll(response, 'con')
    print('text:%s' % text)
    print('file:%s' % files)
    item_twentyeight = four.items.fillinData(title, '', '', '', '', '', '',
                                             '', '', '', '', '', '', text,
                                             files, publishDate, '')
    # BUG FIX: the item was built but never yielded, so Scrapy dropped it.
    yield item_twentyeight
def parse(self, response):
    """Parse a detail page: title and date from #con_title / #con_time,
    body/attachments from #con_con; yield one filled item.
    """
    title = response.xpath('//*[@id="con_title"]/text()').extract_first()
    publishDate = response.xpath(
        '//*[@id="con_time"]/text()').extract_first()
    # Normalise the date: non-digit runs become "-", then the trailing
    # separator is dropped (assumes the raw date ends in a non-digit --
    # TODO confirm against live pages).
    # FIX: raw string -- '\D' is an invalid escape (SyntaxWarning on 3.12+).
    publishDate = re.sub(r'\D', '-', publishDate)
    # Simplified from publishDate[:(len(publishDate) - 1)].
    publishDate = publishDate[:-1]
    te = textEdit()
    text, files = te.dealWithAll(response, id='con_con')
    item_twentynine = four.items.fillinData(title, '', '', '', '', '', '',
                                            '', '', '', '', '', '', text,
                                            files, publishDate, '')
    yield item_twentynine
def parse(self, response):
    """Parse a detail page that has two possible body layouts.

    The newer "view TRS_UEDITOR ..." container is tried first; when it
    yields no text the legacy "TRS_Editor" container is used instead.
    Yields one filled item.
    """
    title = response.xpath('//*[@class="pageHead"]/h2/text()').extract_first()
    print(title)
    publishDate = response.xpath(
        '//*[@class="pageHead"]/h3/span[1]/text()'
    ).extract_first()
    print(publishDate)
    editor = textEdit()
    text, files = editor.dealWithAll(
        response,
        classname="view TRS_UEDITOR trs_paper_default trs_word trs_key4format")
    if not text:
        # Fall back to the older article container.
        text, files = editor.dealWithAll(response, classname="TRS_Editor")
    print('text:%s' % text)
    print('file:%s' % files)
    yield four.items.fillinData(title, '', '', '', '', '', '', '', '', '',
                                '', '', '', text, files, publishDate, '')
def parse(self, response):
    """Parse a detail page whose metadata lives in <meta> tags
    (ArticleTitle / PubData / ContentSource); yield one filled item.
    """
    title = response.xpath(
        '//*[@name="ArticleTitle"]/@content').extract_first()
    print(title)
    publishDate = response.xpath(
        '//*[@name="PubData"]/@content').extract_first()
    print(publishDate)
    source = response.xpath(
        '//*[@name="ContentSource"]/@content').extract_first()
    print(source)
    editor = textEdit()
    text, files = editor.dealWithAll(response, id="zoom")
    print('text:%s' % text)
    print('file:%s' % files)
    yield four.items.fillinData(title, '', '', '', '', '', '', '', '', '',
                                '', '', '', text, files, publishDate, source)
def parse(self, response):
    """Parse a detail page: title from the "zsy_cotitle" heading, date
    from its sub-line, body/attachments from "zsy_comain"; yield one item.
    """
    title = response.xpath(
        '//*[@class="zsy_cotitle"]/text()').extract_first()
    publishDate = response.xpath(
        '//*[@class="zsy_cotitle"]/p/text()').extract_first()
    # FIX: raw string -- '\d' is an invalid escape (SyntaxWarning on 3.12+).
    publishDate = re.findall(r'\d{4}-\d{2}-\d{2}', publishDate)[0]
    te = textEdit()
    text, files = te.dealWithAll(response, classname='zsy_comain')
    item_thirtyone = four.items.fillinData(title, '', '', '', '', '', '',
                                           '', '', '', '', '', '', text,
                                           files, publishDate, '')
    yield item_thirtyone
def parse(self, response):
    """Parse a detail page: h1 title, date from the "date" element,
    body/attachments from #forestry_content; yield one filled item.
    """
    title = response.xpath('//h1/text()').extract_first()
    print(title)
    publishDate = response.xpath(
        '//*[@class="date"]/text()').extract_first()
    # Normalise the date: non-digit runs become "-", then the trailing
    # separator is dropped (assumes the raw date ends in a non-digit --
    # TODO confirm against live pages).
    # FIX: raw string -- '\D' is an invalid escape (SyntaxWarning on 3.12+).
    publishDate = re.sub(r'\D', '-', publishDate)
    publishDate = publishDate[:-1]
    te = textEdit()
    text, files = te.dealWithAll(response, id='forestry_content')
    print('text:%s' % text)
    print('file:%s' % files)
    item_thirtysix = four.items.fillinData(title, '', '', '', '', '', '',
                                           '', '', '', '', '', '', text,
                                           files, publishDate, '')
    yield item_thirtysix
def parse(self, response):
    """Parse a detail page; the date comes from the article byline when
    present. Yields one filled item.
    """
    title = response.xpath('//title/text()').extract_first()
    print(title)
    publishDate = response.xpath(
        '//*[@class="articleAuthor"]/span/strong/text()').extract_first()
    # The byline may be absent; keep None instead of crashing on split().
    if publishDate:
        publishDate = publishDate.split(' ')[0]
    print(publishDate)
    editor = textEdit()
    text, files = editor.dealWithAll(response, classname="article art")
    print('text:%s' % text)
    print('file:%s' % files)
    yield four.items.fillinData(title, '', '', '', '', '', '', '', '', '',
                                '', '', '', text, files, publishDate, '')
def parse(self, response):
    """Parse a detail page; the originating URL is carried in
    response.meta['url']. Yields one filled item.
    """
    title = response.xpath('//*[@class="main_title"]/text()').extract_first()
    print('title', title)
    publishDate = response.xpath(
        '//*[@class="top_about"]/a/text()').extract_first()
    # Keep only the text before the first space.
    publishDate = publishDate.split(' ')[0]
    print('publish', publishDate)
    editor = textEdit()
    text, files = editor.dealWithAll(response, classname='content_word')
    print('text:%s' % text)
    print('file:%s' % files)
    yield four.items.fillinData(title, '', '', None, '', '', '', '', '', '',
                                '', '', '', text, files, publishDate, '',
                                response.meta['url'])
def parse(self, response):
    """Parse a detail page: h1 title, date from the byline paragraph,
    body/attachments from "caijingt_conmain"; yield one filled item.
    """
    title = response.xpath('//h1/text()').extract_first()
    print(title)
    publishDate = response.xpath(
        '//*[@class="caijingt_wztop"]/p/text()').extract_first()
    # Keep the text before the first space, then normalise: non-digit runs
    # become "-" and the trailing separator is dropped.
    publishDate = publishDate.split(' ')[0]
    # FIX: raw string -- '\D' is an invalid escape (SyntaxWarning on 3.12+).
    publishDate = re.sub(r'\D', '-', publishDate)
    publishDate = publishDate[:-1]
    print(publishDate)
    te = textEdit()
    text, files = te.dealWithAll(response, classname="caijingt_conmain")
    print('text:%s' % text)
    print('file:%s' % files)
    item_fortyone = four.items.fillinData(title, '', '', '', '', '', '',
                                          '', '', '', '', '', '', text,
                                          files, publishDate, '')
    yield item_fortyone
def parse(self, response):
    """Parse a detail page; the date is the value after ":" in the first
    space-separated field of the info line. Yields one filled item.
    """
    title = response.xpath('//title/text()').extract_first()
    print(title)
    info_line = response.xpath(
        '//*[@class="detail_main_right_conbg_tit"]/div[3]/text()'
    ).extract_first()
    first_field = info_line.split(' ')[0]
    publishDate = first_field.split(':')[1]
    print('publishDate:%s' % publishDate)
    editor = textEdit()
    text, files = editor.dealWithAll(
        response, classname="detail_main_right_conbg_con")
    print('text:%s' % text)
    print('files:%s' % files)
    yield four.items.fillinData(title, '', '', '', '', '', '', '', '', '',
                                '', '', '', text, files, publishDate, '')
def parse(self, response):
    """Parse a detail page: h2 title, timestamp from #pubTime, source name
    from #sourceName, body from the TRS editor container; yield one item.
    The originating URL is carried in response.meta['url'].

    NOTE(review): `source` is printed but never stored in the item --
    confirm whether it should be passed to fillinData.
    """
    title = response.xpath('//h2/text()').extract_first()
    print('title', title)
    publishDate = response.xpath('//*[@id="pubTime"]/text()').extract_first()
    # Keep the text before the first space, then normalise: non-digit runs
    # become "-" and the trailing separator is dropped.
    publishDate = publishDate.split(' ')[0]
    # FIX: raw string -- '\D' is an invalid escape (SyntaxWarning on 3.12+).
    publishDate = re.sub(r'\D', '-', publishDate)
    publishDate = publishDate[:-1]
    print('publish', publishDate)
    source = response.xpath('//*[@id="sourceName"]/text()').extract_first()
    print('source', source)
    te = textEdit()
    text, files = te.dealWithAll(response, classname='TRS_Editor')
    print('text:%s' % text)
    print('file:%s' % files)
    item_two = four.items.fillinData(title, '', '', None, '', '', '', '', '',
                                     '', '', '', '', text, files, publishDate,
                                     '', response.meta['url'])
    yield item_two
def parse(self, response):
    """Parse a detail page: flattened h1 title, source and date from the
    "fl" info spans, body from "conbox2 boxcenter"; yield one filled item.

    NOTE(review): `source` is printed but never stored in the item.
    """
    # string(.) flattens the <h1> so nested inline markup is included.
    title = response.xpath('//h1').xpath('string(.)').extract_first()
    print(title)
    source = response.xpath(
        '//*[@class="fl"]/span[1]/text()').extract_first()
    print(source)
    publishDate = response.xpath(
        '//*[@class="fl"]/span[3]/text()').extract_first()
    # Keep only the text before the first space.
    publishDate = publishDate.split(' ')[0]
    print(publishDate)
    editor = textEdit()
    text, files = editor.dealWithAll(response, classname="conbox2 boxcenter")
    print('text:%s' % text)
    print('file:%s' % files)
    yield four.items.fillinData(title, '', '', '', '', '', '', '', '', '',
                                '', '', '', text, files, publishDate, '')