def parse(self, response):
    """Extract Heilongjiang provincial government leaders.

    Yields one GovPeopleItem for the governor, one per vice governor,
    then follows the "other leaders" link when the page has one.
    """
    # Governor entry: link text reads "<position>:<name>".
    governor = response.xpath("//div[@class='twolbta']/div[2]")
    if governor:
        title_and_name = governor.xpath("./a/text()").extract_first().split(":")
        item = GovPeopleItem()
        item['name'] = title_and_name[1]
        item['position'] = title_and_name[0]
        item['department'] = '省政府领导'
        item['people_url'] = governor.xpath("./a/@href").extract_first()
        item['province'] = '黑龙江省'
        item['city'] = ''
        yield item

    # Vice governors: one <li> per person.
    for entry in response.xpath("//div[@class='twolbtb']/ul/li"):
        item = GovPeopleItem()
        item['name'] = entry.xpath("./a/text()").extract_first()
        item['position'] = '副省长'
        item['department'] = '省政府领导'
        item['people_url'] = entry.xpath("./a/@href").extract_first()
        item['province'] = '黑龙江省'
        item['city'] = ''
        yield item

    # Remaining leaders live on a separate page; follow it if present.
    next_url = response.xpath(
        "//div[@class='twolbbot']/a/@href").extract_first()
    if next_url:
        yield scrapy.Request(next_url, callback=self.other_leader)
def parse(self, response):
    """Extract Jilin provincial government leaders.

    Yields a GovPeopleItem for the governor, each vice governor
    (dl #2 / first two <ul>s) and the secretary-general (dl #4 /
    third <ul>).
    """
    # Governor: dl[1] holds the title, the first photo div the name.
    governor_dl = response.xpath(
        "//div[@class='province_list ld_ys_inherit']/dl[1]")
    if governor_dl:
        item = GovPeopleItem()
        item['position'] = governor_dl.xpath("./dt/text()").extract_first()
        item['name'] = response.xpath(
            "//div[@class='province_list ld_ys_inherit']/div[1]//h2/a/text()"
        ).extract_first()
        item['people_url'] = response.urljoin(response.xpath(
            "//div[@class='province_list ld_ys_inherit']/div[1]//h2/a/@href"
        ).extract_first())
        item['department'] = '省政府领导'
        item['province'] = '吉林省'
        item['city'] = ''
        yield item

    # The vice-governor and secretary-general sections share one item
    # layout; each is (marker dl, position title, member-list xpath).
    sections = (
        ("//div[@class='province_list ld_ys_inherit']/dl[position()=2]",
         "副省长",
         "//div[@class='province_list ld_ys_inherit']/ul[position()<=2]/li"),
        ("//div[@class='province_list ld_ys_inherit']/dl[position()=4]",
         "秘书长",
         "//div[@class='province_list ld_ys_inherit']/ul[position()=3]/li"),
    )
    for marker_xpath, position, members_xpath in sections:
        if not response.xpath(marker_xpath):
            continue
        for li in response.xpath(members_xpath):
            item = GovPeopleItem()
            item['name'] = li.xpath("./a/p/text()").extract_first()
            item['position'] = position
            item['people_url'] = response.urljoin(
                li.xpath("./a/@href").extract_first())
            item['department'] = '省政府领导'
            item['province'] = '吉林省'
            item['city'] = ''
            yield item
def parse(self, response):
    """Extract Inner Mongolia regional government leaders.

    The left column holds the chairman; the first two groups of the
    right column hold either a <li> list of people or a single <a>.
    """
    def build_item(position, name, href):
        # All items share the same fixed region fields.
        item = GovPeopleItem()
        item['position'] = position
        item['name'] = name
        item['people_url'] = response.urljoin(href)
        item['department'] = '自治区政府'
        item['province'] = '内蒙古'
        item['city'] = ''
        return item

    # Left column: chairman, heading ends with a colon to strip.
    chairman_heading = response.xpath(
        "//div[@class='zwgk_ldjl_zx_detail left']/div/span/text()"
    ).extract_first()
    if chairman_heading:
        yield build_item(
            chairman_heading.replace(":", ""),
            response.xpath(
                "//div[@class='zwgk_ldjl_zx_detail left']/div/a/text()"
            ).extract_first(),
            response.xpath(
                "//div[@class='zwgk_ldjl_zx_detail left']/div/a/@href"
            ).extract_first())

    # Right column: each group is a heading plus either a member list
    # or a single inline link.
    for group in response.xpath(
            "//div[@class='zwgk_ldjl_zx_right left']/div[position()<=2]"):
        position = group.xpath(
            "./div[1]/text()").extract_first().replace(":", "")
        members = group.xpath("./div[2]//li")
        if members:
            for li in members:
                yield build_item(position,
                                 li.xpath("./a/text()").extract_first(),
                                 li.xpath("./a/@href").extract_first())
        else:
            yield build_item(position,
                             group.xpath("./a/text()").extract_first(),
                             group.xpath("./a/@href").extract_first())
def parse_detai(self, response):
    """Build a GovPeopleItem for one Beijing leader from a detail page.

    The name and position are parsed from the page <title>
    ("<name>-<position>-<site suffix>"); the department is passed
    through request meta by the listing-page callback.
    """
    title = response.xpath("//title/text()").extract_first()
    parts = title.replace("-领导-首都之窗-北京市政务门户网站", "").split('-')
    item = GovPeopleItem()
    item['name'] = parts[0]
    item['position'] = parts[1]
    item['province'] = '北京'
    item['city'] = '北京'
    item['department'] = response.meta['department']
    item['people_url'] = response.url
    yield item
def other_leader(self, response):
    """Parse the Heilongjiang "other leaders" page.

    Each block's first child div holds "<position> <name>"; these
    entries carry no detail link, so people_url stays empty.
    """
    for block in response.xpath("//div[@class='f000 twolmain']"):
        fields = block.xpath("./div[1]/text()").extract_first().split(' ')
        item = GovPeopleItem()
        item['name'] = fields[1]
        item['position'] = fields[0]
        item['department'] = '省政府领导'
        item['people_url'] = ''
        item['province'] = '黑龙江省'
        item['city'] = ''
        yield item
def parse(self, response):
    """Extract Liaoning provincial leaders.

    Each <li> link text reads "<name> <position>", padded with
    U+3000 ideographic spaces that must be stripped.
    """
    for entry in response.xpath("//div[@class='l-box-right']/ul/li"):
        parts = entry.xpath("./a/text()").extract_first().split(" ")
        item = GovPeopleItem()
        item['name'] = parts[0].replace(u'\u3000', '')
        item['position'] = parts[1].replace(u'\u3000', '')
        item['department'] = '省政府领导'
        item['people_url'] = response.urljoin(
            entry.xpath("./a/@href").extract_first())
        item['province'] = '辽宁省'
        item['city'] = ''
        yield item
def parse(self, response):
    """Extract Hebei provincial leaders.

    Each <li> link text reads "<position> <name>"; yields one
    GovPeopleItem per entry with an absolute detail-page URL.
    """
    for li in response.xpath("//div[@class='left_zhong']/ul/li"):
        names = li.xpath("./a/text()").extract_first().split(' ')
        item = GovPeopleItem()
        item['name'] = names[1]
        item['position'] = names[0]
        item['department'] = '省政府领导'
        item['province'] = '河北省'
        item['city'] = ''
        # BUG FIX: the original used the absolute path "/a/@href",
        # which selects <a> children of the document root — never the
        # current <li> — so extract_first() always returned None.
        # "./a/@href" is relative to the <li>, matching the sibling
        # spiders in this file.
        item['people_url'] = response.urljoin(
            li.xpath("./a/@href").extract_first())
        yield item
def parse(self, response):
    """Extract Shanghai municipal government leaders.

    The first three <p> blocks hold a position heading followed by one
    <a> per person; paragraphs 4 and 5 hold plain-text
    "position:name name ..." lists separated by NBSP runs.
    """
    # Linked leaders (paragraphs 1-3).
    for para in response.xpath("//div[@id='Tab1-1']/p[position()<=3]"):
        position = para.xpath("./text()").extract_first().replace(
            ':', '').replace(' ', '')
        for link in para.xpath("./a"):
            item = GovPeopleItem()
            # Strip U+3000 ideographic-space padding from names.
            item['name'] = link.xpath("./text()").extract_first().replace(
                u'\u3000', '')
            item['people_url'] = link.xpath("./@href").extract_first()
            item['position'] = position
            item['department'] = "市政府领导"
            item['province'] = '上海'
            item['city'] = '上海'
            yield item

    # Unlinked leaders (paragraphs 4-5): names joined by five NBSPs.
    for para in response.xpath(
            "//div[@id='Tab1-1']/p[position()=4 or position()=5]"):
        segments = para.xpath("./text()").extract_first().split(':')
        position = segments[0]
        names = segments[1].replace(u'\u3000', '').split(
            u"\xa0\xa0\xa0\xa0\xa0")
        for name in names:
            item = GovPeopleItem()
            item['name'] = name
            item['people_url'] = ''
            item['position'] = position
            item['department'] = "市政府领导"
            item['province'] = '上海'
            item['city'] = '上海'
            yield item
def parse(self, response):
    """Extract Shanxi provincial leaders, grouped under position
    headings; each <dl> inside a group is one person."""
    for group in response.xpath(
            "//ul[@class='provincial-leaders-inner oflow-hd']/li"):
        # Headings are padded with NBSP characters.
        position = group.xpath("./div/text()").extract_first().replace(
            u"\xa0", "")
        for person in group.xpath("./dl"):
            item = GovPeopleItem()
            item['people_url'] = response.urljoin(
                person.xpath("./dd/a/@href").extract_first())
            # Names carry U+3000 ideographic-space padding.
            item['name'] = person.xpath(
                "./dd/a/text()").extract_first().replace(u"\u3000", "")
            item['province'] = '山西省'
            item['city'] = ''
            item['department'] = '省政府领导'
            item['position'] = position
            yield item
def parse(self, response):
    """Extract Jiangsu provincial leaders.

    Each section carries a position heading; a section without one is
    the vice-governor block.
    """
    for section in response.xpath("//div[@class='ld-205']"):
        heading = section.xpath(
            "./div[@class='ld-zw']/text()").extract_first()
        # Missing heading => vice governors.
        position = '副省长' if heading is None else heading
        for person in section.xpath("./div[@class='ld-xx']//li"):
            item = GovPeopleItem()
            # Both names and headings are padded with NBSPs.
            item['name'] = person.xpath(
                "./a/div[2]/text()").extract_first().replace(u"\xa0", "")
            item['position'] = position.replace(u'\xa0', '')
            item['department'] = '省政府领导'
            item['people_url'] = response.urljoin(
                person.xpath("./a/@href").extract_first())
            item['province'] = '江苏省'
            item['city'] = ''
            yield item