def get_newsinfo(self, urls):
    """Fetch every news detail page and store the parsed item.

    For each URL the method pauses via ``t_sleep()``, logs the URL and
    downloads it with a 3-second timeout, forcing the response encoding to
    UTF-8. When the status code is 200 the body is parsed with
    ``etree.HTML`` and handed, together with the final response URL, to
    ``self.parse_item``; the resulting item is inserted through a freshly
    created ``MogoMgr``.

    If the request raises, the error is logged and printed, ``retry`` is set
    to ``1`` on the class, and the URL is skipped. Responses with any other
    status code are skipped without logging.

    :param urls: iterable of news detail links
    :return: None
    """
    for link in urls:
        t_sleep()
        log('当前访问的URL', link)

        try:
            resp = requests.get(link, timeout=3)
            resp.encoding = 'utf-8'
        except Exception as err:
            log_line('访问出错')
            print(err)
            # NOTE(review): the flag is written on the class, not on this
            # instance -- confirm that is what the retry logic expects.
            self.__class__.retry = 1
            continue

        # Only successful responses are parsed and persisted.
        if resp.status_code == 200:
            tree = etree.HTML(resp.text)
            record = self.parse_item(tree, resp.url)
            MogoMgr().insert(record)
def __init__(self):
    """Initialise per-instance state.

    Sets an empty ``headers`` dict, stores the result of
    ``self.get_date()`` in ``date``, creates a ``MogoMgr`` and starts with
    an empty ``failurls`` list.
    """
    self.headers = {}
    self.date = self.get_date()
    self.mgr = MogoMgr()
    # NOTE(review): per-instance ``retry`` / ``retry_flag`` assignments
    # (both ``-1``) were commented out in the original and stay disabled.
    self.failurls = []
def __init__(self):
    """Initialise per-instance state.

    Sets an empty ``headers`` dict, creates a ``MogoMgr``, starts with an
    empty ``newslist`` and records the single www.amac.org.cn start URL.
    """
    self.headers = {}
    self.mgr = MogoMgr()
    self.newslist = []
    # Presumably the listing page the crawl begins from -- verify in caller.
    self.start_urls = [
        'http://www.amac.org.cn/xydt/xyxx/',
    ]
def __init__(self):
    """Initialise per-instance state.

    Sets an empty ``headers`` dict, creates a ``MogoMgr``, starts with an
    empty ``newslist`` and records the single www.gov.cn start URL.
    """
    self.headers = {}
    self.mgr = MogoMgr()
    self.newslist = []
    # Presumably the listing page the crawl begins from -- verify in caller.
    self.start_urls = [
        'http://www.gov.cn/pushinfo/v150203/base_14px_pubdate.htm',
    ]
def __init__(self):
    """Initialise per-instance state.

    Sets an empty ``headers`` dict, creates a ``MogoMgr``, starts with an
    empty ``newslist`` and records the two www.csrc.gov.cn start URLs.
    """
    self.headers = {}
    self.mgr = MogoMgr()
    self.newslist = []
    # Presumably the listing pages the crawl begins from -- verify in caller.
    self.start_urls = [
        'http://www.csrc.gov.cn/pub/zjhpublic/3300/3302/index_7401.htm',
        'http://www.csrc.gov.cn/pub/zjhpublic/3300/3311/index_7401.htm',
    ]
def __init__(self):
    """Initialise per-instance state.

    Sets an empty ``headers`` dict, creates a ``MogoMgr``, starts with an
    empty ``newslist`` and records the three www.circ.gov.cn start URLs.
    """
    self.headers = {}
    self.mgr = MogoMgr()
    self.newslist = []
    # Presumably the listing pages the crawl begins from -- verify in caller.
    self.start_urls = [
        'http://www.circ.gov.cn/web/site0/tab5176/',
        'http://www.circ.gov.cn/web/site0/tab7924/',
        'http://www.circ.gov.cn/web/site0/tab5207/',
    ]
def __init__(self):
    """Initialise per-instance state.

    Sets an empty ``headers`` dict, creates a ``MogoMgr``, starts with an
    empty ``newslist``, sets ``retry`` and ``retry_flag`` to ``-1``, starts
    with an empty ``failurls`` list and records the single www.amac.org.cn
    start URL.
    """
    self.headers = {}
    self.mgr = MogoMgr()
    self.newslist = []

    # Retry bookkeeping; -1 is the initial value for both fields.
    self.retry = -1
    self.retry_flag = -1
    self.failurls = []

    # Presumably the listing page the crawl begins from -- verify in caller.
    self.start_urls = [
        'http://www.amac.org.cn/flfg/flfgwb/',
    ]
def __init__(self):
    """Initialise per-instance state.

    Sets an empty ``headers`` dict, creates a ``MogoMgr``, starts with an
    empty ``newslist`` and records the four www.mohurd.gov.cn start URLs.
    """
    self.headers = {}
    self.mgr = MogoMgr()
    self.newslist = []
    # Presumably the listing pages the crawl begins from -- verify in caller.
    self.start_urls = [
        'http://www.mohurd.gov.cn/zcjd/index.html',
        'http://www.mohurd.gov.cn/fdcy/fdcyzcfb/index.html',
        'http://www.mohurd.gov.cn/fdcy/fdcyxydt/index.html',
        'http://www.mohurd.gov.cn/fdcy/fdcydfxx/index.html',
    ]
def __init__(self):
    """Initialise per-instance state.

    Sets an empty ``headers`` dict and creates a ``MogoMgr``.
    """
    self.headers = {}
    self.mgr = MogoMgr()
def __init__(self):
    """Initialise per-instance state.

    Sets an empty ``headers`` dict, creates a ``MogoMgr`` and records the
    www.pbc.gov.cn host URL (scheme and host only, no trailing slash).
    """
    self.headers = {}
    self.mgr = MogoMgr()
    self.host_url = 'http://www.pbc.gov.cn'
def __init__(self):
    """Initialise per-instance state.

    Sets an empty ``headers`` dict, creates a ``MogoMgr``, starts with an
    empty ``newslist`` and records the single www.cbrc.gov.cn start URL.
    Note the attribute here is the singular ``start_url`` holding a string,
    not a list.
    """
    self.headers = {}
    self.mgr = MogoMgr()
    self.newslist = []
    self.start_url = 'http://www.cbrc.gov.cn/chinese/zhengcefg.html'