def parse_one(self, response):
    """Build one ShanghaioneItem from a detail page and yield it.

    The title and date are pulled out of the raw response body with
    regular expressions; the content comes from the first
    ``//div[@id='zoom']/div`` node. ``yuan`` and ``province`` are
    hard-coded for this spider.

    Args:
        response: The downloaded page; must expose ``body`` (raw bytes)
            and ``xpath()``.

    Yields:
        A single ShanghaioneItem with ``title``, ``md5``, ``times``,
        ``content``, ``yuan`` and ``province`` set. ``times`` is ``""``
        when no ``%Y-%m-%d`` date can be extracted; ``content`` is ``""``
        when the content node is absent.

    Raises:
        IndexError: When iterated, if the body contains no
            ``<h1>...</h1>`` title.
    """
    item = ShanghaioneItem()

    # response.body is bytes, so the patterns must be bytes as well (a str
    # pattern raises TypeError on Python 3). Only the captured text is
    # decoded, using the same GBK codec the original code applied.
    title = re.findall(br'<h1>(.*?)</h1>', response.body)[0].decode("gbk")
    item['title'] = title
    # The digest is computed over the UTF-8 bytes of the title.
    item['md5'] = hashlib.md5(title.encode("utf-8")).hexdigest()

    # The date is best-effort: a missing or malformed value becomes ""
    # instead of aborting the whole item. UnicodeDecodeError is a
    # ValueError subclass, so undecodable bytes land here too.
    try:
        times = re.findall(br"t ='(.*)'", response.body)[0].decode("gbk")
        time.strptime(times, "%Y-%m-%d")
    except (IndexError, ValueError):
        times = ""
    item['times'] = times

    content = response.xpath("//div[@id='zoom']/div").extract()
    item['content'] = content[0] if content else ""

    item['yuan'] = "广东省经济和信息化委员会"
    item["province"] = "广东"
    yield item
def parse_one(self, response):
    """Build one ShanghaioneItem from a detail page and yield it.

    The title is read from ``//div[@class='nstit']/h1``, the date from
    the text of ``//div[@class='nstimes0']`` and the content from
    ``//div[@class='TRS_Editor']``. ``yuan`` and ``province`` are
    hard-coded for this spider.

    Args:
        response: The downloaded page; must expose ``xpath()``.

    Yields:
        A single ShanghaioneItem with ``title``, ``md5``, ``times``,
        ``content``, ``yuan`` and ``province`` set. ``times`` is ``""``
        when no ``%Y-%m-%d`` date can be extracted; ``content`` is ``""``
        when the content node is absent.

    Raises:
        IndexError: When iterated, if the page has no title node.
    """
    item = ShanghaioneItem()

    title = response.xpath("//div[@class='nstit']/h1/text()").extract()[0]
    item['title'] = title
    # The digest is computed over the UTF-8 bytes of the title.
    item['md5'] = hashlib.md5(title.encode("utf-8")).hexdigest()

    # The date is best-effort: a missing node, too few tokens or a
    # malformed value all become "" instead of aborting the whole item.
    try:
        info = response.xpath("//div[@class='nstimes0']/text()").extract()[0]
        # Third whitespace-separated token, minus its 3-character prefix.
        times = info.strip().split()[2][3:]
        time.strptime(times, "%Y-%m-%d")
    except (IndexError, ValueError):
        times = ""
    item['times'] = times

    content = response.xpath("//div[@class='TRS_Editor']").extract()
    item['content'] = content[0] if content else ""

    item['yuan'] = "江苏省经济和信息化委员会"
    item["province"] = "江苏"
    yield item
def parse_one(self, response):
    """Build one ShanghaioneItem from a detail page and yield it.

    The title is read from ``//td[@class='title']``, the date from the
    last text node of the small-print ``td`` and the content from
    ``//div[@id='zoom']``. ``yuan`` and ``province`` are hard-coded for
    this spider.

    Args:
        response: The downloaded page; must expose ``xpath()``.

    Yields:
        A single ShanghaioneItem with ``title``, ``md5``, ``times``,
        ``content``, ``yuan`` and ``province`` set. ``times`` is ``""``
        when no ``%Y-%m-%d`` date can be extracted; ``content`` is the
        HTML of the first matching node, or ``""`` when it is absent.

    Raises:
        IndexError: When iterated, if the page has no title node.
    """
    item = ShanghaioneItem()

    title = response.xpath("//td[@class='title']/text()").extract()[0]
    item['title'] = title
    # The digest is computed over the UTF-8 bytes of the title.
    item['md5'] = hashlib.md5(title.encode("utf-8")).hexdigest()

    # The date is best-effort: a missing node or a malformed value
    # becomes "" instead of aborting the whole item.
    try:
        cells = response.xpath(
            "//td[@style='line-height:20px;font-size:12px;']/text()"
        ).extract()
        times = cells[-1].replace("印发时间:", "")
        time.strptime(times, "%Y-%m-%d")
    except (IndexError, ValueError):
        times = ""
    item['times'] = times

    # Store a single HTML string rather than the whole extract() list.
    content = response.xpath("//div[@id='zoom']").extract()
    item['content'] = content[0] if content else ""

    item['yuan'] = "浙江省经济和信息化委员会"
    item["province"] = "浙江"
    yield item
def parse_one(self, response):
    """Build one ShanghaioneItem from a detail page and yield it.

    The title is read from ``//h1[@id='ivs_title']``, the date from the
    text of ``//h3[@class='view_tit_1']`` and the content from
    ``//div[@id='ivs_content']``. ``yuan`` and ``province`` are
    hard-coded for this spider.

    Args:
        response: The downloaded page; must expose ``xpath()``.

    Yields:
        A single ShanghaioneItem with ``title``, ``md5``, ``times``,
        ``content``, ``yuan`` and ``province`` set. ``times`` is ``""``
        when no ``%Y-%m-%d`` date can be extracted; ``content`` is ``""``
        when the content node is absent.

    Raises:
        IndexError: When iterated, if the page has no title node.
    """
    item = ShanghaioneItem()

    title = response.xpath("//h1[@id='ivs_title']/text()").extract()[0]
    item['title'] = title
    # The digest is computed over the UTF-8 bytes of the title.
    item['md5'] = hashlib.md5(title.encode("utf-8")).hexdigest()

    # The date is best-effort: a missing header, a header without digits
    # or a malformed value all become "" instead of aborting the item.
    try:
        header = response.xpath("//h3[@class='view_tit_1']/text()").extract()[0]
        # Greedy match: spans from the first digit to the last digit.
        times = re.findall(r"\d.*\d", header)[0]
        time.strptime(times, "%Y-%m-%d")
    except (IndexError, ValueError):
        times = ""
    item['times'] = times

    content = response.xpath("//div[@id='ivs_content']").extract()
    item['content'] = content[0] if content else ""

    item['yuan'] = "上海市经济和信息化委员会"
    item["province"] = "上海"
    yield item