def __init__(self):
    """Initialize the Shanghai (SGCC) news spider.

    Sets up the base spider state, an HTTP session, a JS engine that
    evaluates the site's ``shanghai.js`` menu script, and the per-section
    lookup tables (section key -> list-page URL / save-folder name).
    """
    spiderBase.__init__(self)
    self.url_base = 'http://www.sh.sgcc.com.cn/sdnw2010/'
    # self.logger = login.initLog('xz.log')
    # print() form works identically under Python 2 and Python 3.
    print('shanghaiSpider')
    self.session = self.init_session()
    # Root save-folder name ("Shanghai"); runtime data, kept verbatim.
    self.folder_base = u'上海'
    js_str = common_utils.read_file_content('shanghai.js')
    self.jsEngineMgr = jsEngineMgr.initJsEngine()
    self.jsShowMenu2 = self.jsEngineMgr.eval(js_str.encode('utf-8'))

    # One row per news section: (key, save-folder name, list-page URL).
    # Folder names are the sections' Chinese titles and are runtime data,
    # so they are preserved byte-for-byte.
    sections = [
        # State Grid news
        ('gwxw', u'国网要闻',
         'http://www.sh.sgcc.com.cn/sdnw2010/load.loadPage.d?page=sub.xml&siteCode=sdnw&urlChannelId=MjE5NDU3NjE%3D'),
        # company news (note: different page parameter, no channel id)
        ('gsyw', u'公司要闻',
         'http://www.sh.sgcc.com.cn/sdnw2010/load.loadPage.d?page=sub_gsyw.xml&siteCode=sdnw'),
        # headquarters news
        ('bbxw', u'本部新闻',
         'http://www.sh.sgcc.com.cn/sdnw2010/load.loadPage.d?page=sub.xml&siteCode=sdnw&urlChannelId=MTAwMTg0'),
        # grassroots information
        ('jcxx', u'基层信息',
         'http://www.sh.sgcc.com.cn/sdnw2010/load.loadPage.d?page=sub.xml&siteCode=sdnw&urlChannelId=MTAwMTg3'),
        # media coverage
        ('mtbd', u'媒体报道',
         'http://www.sh.sgcc.com.cn/sdnw2010/load.loadPage.d?page=sub.xml&siteCode=sdnw&urlChannelId=MTAwMTg1'),
        # industry news
        ('hyzx', u'行业资讯',
         'http://www.sh.sgcc.com.cn/sdnw2010/load.loadPage.d?page=sub.xml&siteCode=sdnw&urlChannelId=MTAwMTg2'),
    ]
    # Derive the key list and both lookup maps from the single table above,
    # replacing twelve hand-written parallel assignments.
    self.section_key = [key for key, _, _ in sections]
    self.section_folder_map = {key: folder for key, folder, _ in sections}
    self.section_url_map = {key: url for key, _, url in sections}
    # Per-section POST payload / Referer caches; start empty.
    self.post_data_map = {}
    self.referer_map = {}
# NOTE(review): the statements below up to the ``if __name__`` guard are the
# tail of an article-download method whose ``def`` header lies outside this
# chunk; they reference ``self``/``contentHtml``/``file_name`` from that
# scope and are reproduced verbatim — re-indent to the enclosing method's
# body level when merging. TODO confirm enclosing method.
#if time.strptime( pubdt, '%Y-%m-%d' ).tm_year != 2015:
#    self.logger.error(u'不下载 发布时间:' + pubdt + 'u' + article_item.title)
#    return
if contentHtml.status_code == requests.codes.ok:
    self.logger.info(u'返回成功')
    common_utils.write_to_file_with_stream(contentHtml.content, file_name)
else:
    self.logger.error(u'下载失败!!!:' + file_name)


if __name__ == '__main__':
    # Bootstrap: JS engine, log file, save folder, and the date cut-off,
    # then walk every section page by page and fetch each article.
    js_str = common_utils.read_file_content('shanghai.js')
    shanghai_spider = shanghaiSpider()
    shanghai_spider.init_log(u'上海.log')
    shanghai_spider.jsEngineMgr = jsEngineMgr.initJsEngine()
    shanghai_spider.jsShowMenu2 = shanghai_spider.jsEngineMgr.eval(js_str.encode('utf-8'))
    shanghai_spider.set_save_folder_path(globalconf.save_folder['shanghai'])
    shanghai_spider.init_mkdir_folder()
    # Configured limit string is split as date (first 8 chars) + time (rest).
    str_limit_date = globalconf.spider_limit_date_time['shanghai']
    shanghai_spider.set_limit_date_time(str_limit_date[0:8], str_limit_date[9:])
    for section_item in shanghai_spider.section_key:
        shanghai_spider.logger.info(u"获取栏目:" + section_item + ":" +
                                    shanghai_spider.section_folder_map[section_item])
        for page_num in range(shanghai_spider.page_number):
            article_list = shanghai_spider.stripy_article_list(section_item, page_num)
            for item in article_list:
                shanghai_spider.stripy_article_context(item)