示例#1
0
    def get_newsinfo(self, urls):
        """Visit every news detail URL and store the item parsed from it.

        For each URL the method calls ``t_sleep()``, logs the URL, issues a
        GET request with ``timeout=3`` and forces the response encoding to
        UTF-8.  A response with status code 200 is parsed with
        ``etree.HTML``; the resulting tree and the response's final URL are
        handed to ``self.parse_item`` and the returned item is passed to
        ``MogoMgr().insert``.  Any other status code is skipped silently.

        If the request (or the encoding assignment) raises, the error is
        logged and printed, ``retry`` is set to 1 on the class, and the loop
        moves on to the next URL.

        :param urls: iterable of news detail page URLs
        :return: None -- items are inserted, nothing is returned
        """
        for link in urls:
            t_sleep()
            log('当前访问的URL', link)

            try:
                page = requests.get(link, timeout=3)
                page.encoding = 'utf-8'
            except Exception as err:
                log_line('访问出错')
                print(err)
                # The flag is written to the class object, not the instance.
                # NOTE(review): an instance attribute named ``retry`` would
                # shadow this value -- confirm that is intended.
                self.__class__.retry = 1
            else:
                # Only successful (HTTP 200) responses are parsed and stored.
                if page.status_code == 200:
                    tree = etree.HTML(page.text)
                    record = self.parse_item(tree, page.url)
                    MogoMgr().insert(record)
示例#2
0
文件: zqrb.py 项目: NickLeeCoder/ttj
 def __init__(self):
     """Set up the per-instance state.

     Creates an empty ``headers`` dict, stores the value returned by
     ``self.get_date()`` in ``date``, creates a ``MogoMgr`` instance and an
     empty ``failurls`` list (presumably for URLs that failed -- confirm).
     """
     self.headers = dict()
     self.date = self.get_date()
     self.mgr = MogoMgr()
     # NOTE: ``retry`` and ``retry_flag`` (both formerly set to -1) are not
     # initialised here; their assignments are commented out.
     self.failurls = list()
示例#3
0
文件: amac2.py 项目: NickLeeCoder/ttj
 def __init__(self):
     """Initialise the instance attributes.

     ``headers`` and ``newslist`` start out empty, ``mgr`` is a fresh
     ``MogoMgr`` instance and ``start_urls`` holds the single URL
     configured below.
     """
     seed_pages = ['http://www.amac.org.cn/xydt/xyxx/']

     self.headers = dict()
     self.mgr = MogoMgr()
     self.newslist = list()
     self.start_urls = seed_pages
示例#4
0
 def __init__(self):
     """Initialise the instance attributes.

     ``headers`` and ``newslist`` start out empty, ``mgr`` is a fresh
     ``MogoMgr`` instance and ``start_urls`` holds the single www.gov.cn
     URL configured below.
     """
     seed_pages = ['http://www.gov.cn/pushinfo/v150203/base_14px_pubdate.htm']

     self.headers = dict()
     self.mgr = MogoMgr()
     self.newslist = list()
     self.start_urls = seed_pages
示例#5
0
文件: csrc.py 项目: NickLeeCoder/ttj
 def __init__(self):
     """Initialise the instance attributes.

     ``headers`` and ``newslist`` start out empty, ``mgr`` is a fresh
     ``MogoMgr`` instance and ``start_urls`` holds the two www.csrc.gov.cn
     URLs configured below.
     """
     seed_pages = [
         'http://www.csrc.gov.cn/pub/zjhpublic/3300/3302/index_7401.htm',
         'http://www.csrc.gov.cn/pub/zjhpublic/3300/3311/index_7401.htm',
     ]

     self.headers = dict()
     self.mgr = MogoMgr()
     self.newslist = list()
     self.start_urls = seed_pages
示例#6
0
文件: circ.py 项目: NickLeeCoder/ttj
 def __init__(self):
     """Initialise the instance attributes.

     ``headers`` and ``newslist`` start out empty, ``mgr`` is a fresh
     ``MogoMgr`` instance and ``start_urls`` holds the three
     www.circ.gov.cn URLs configured below.
     """
     seed_pages = [
         'http://www.circ.gov.cn/web/site0/tab5176/',
         'http://www.circ.gov.cn/web/site0/tab7924/',
         'http://www.circ.gov.cn/web/site0/tab5207/',
     ]

     self.headers = dict()
     self.mgr = MogoMgr()
     self.newslist = list()
     self.start_urls = seed_pages
示例#7
0
 def __init__(self):
     """Initialise the instance attributes.

     ``headers``, ``newslist`` and ``failurls`` start out empty, ``mgr`` is
     a fresh ``MogoMgr`` instance, ``retry`` and ``retry_flag`` both start
     at -1, and ``start_urls`` holds the single URL configured below.
     """
     seed_pages = ['http://www.amac.org.cn/flfg/flfgwb/']

     self.headers = dict()
     self.mgr = MogoMgr()
     self.newslist = list()
     # Both retry fields begin at -1; what that value means is not visible
     # in this block.
     self.retry = self.retry_flag = -1
     self.failurls = list()
     self.start_urls = seed_pages
示例#8
0
 def __init__(self):
     """Initialise the instance attributes.

     ``headers`` and ``newslist`` start out empty, ``mgr`` is a fresh
     ``MogoMgr`` instance and ``start_urls`` holds the four
     www.mohurd.gov.cn URLs configured below.
     """
     seed_pages = [
         'http://www.mohurd.gov.cn/zcjd/index.html',
         'http://www.mohurd.gov.cn/fdcy/fdcyzcfb/index.html',
         'http://www.mohurd.gov.cn/fdcy/fdcyxydt/index.html',
         'http://www.mohurd.gov.cn/fdcy/fdcydfxx/index.html',
     ]

     self.headers = dict()
     self.mgr = MogoMgr()
     self.newslist = list()
     self.start_urls = seed_pages
示例#9
0
 def __init__(self):
     """Create an empty ``headers`` dict and a fresh ``MogoMgr`` instance."""
     self.headers = dict()
     self.mgr = MogoMgr()
示例#10
0
文件: pbc.py 项目: NickLeeCoder/ttj
 def __init__(self):
     """Initialise the instance attributes.

     ``headers`` starts out empty, ``mgr`` is a fresh ``MogoMgr`` instance
     and ``host_url`` is the www.pbc.gov.cn root URL (presumably used to
     resolve relative links -- confirm against the callers).
     """
     site_root = 'http://www.pbc.gov.cn'

     self.headers = dict()
     self.mgr = MogoMgr()
     self.host_url = site_root
示例#11
0
 def __init__(self):
     """Initialise the instance attributes.

     ``headers`` and ``newslist`` start out empty, ``mgr`` is a fresh
     ``MogoMgr`` instance and ``start_url`` is the single www.cbrc.gov.cn
     URL configured below (a string, not a list).
     """
     entry_page = 'http://www.cbrc.gov.cn/chinese/zhengcefg.html'

     self.headers = dict()
     self.mgr = MogoMgr()
     self.newslist = list()
     self.start_url = entry_page