def get_lunbo(self):
    '''
    Finance section carousel
    :return:
    '''
    url = 'http://www.news.cn/fortune/'
    html = requests.get(url, headers=self.get_caijing_header())
    html.encoding = 'utf-8'
    html = etree.HTML(html.text)
    urls = html.xpath('//div[@class="swiper-slide"]/a/@href')
    year = arrow.now().date().year

    news_list = []
    for url in urls:
        # Dated article URLs contain the current year; skip everything else
        if str(year) in url:
            log('Carousel URL to visit', url)
            find_one = self.mgr.find_one('url', url)
            if find_one is not None:
                log_line('URL already exists, skipping request')
                log(url)
                continue
            news = self.get_iteminfo(url)
            if news == 'timeout' or news == 'error':
                continue
            news_list.append(news)
    return news_list

def parser_data(self, url):
    '''
    Request a single news detail page
    :param url:
    :return: (date, content), or sentinel strings on failure
    '''
    t_sleep()
    log('Currently visiting URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('Request failed')
        print(e)
        self.__class__.retry = 1
        return 'timeout', 'timeout'

    if html.status_code != 200:
        return 'error', 'error'

    response = etree.HTML(html.text)
    con_list = response.xpath(
        '//div[@class="ldContent"]/descendant-or-self::*/text()')
    content = ''.join(con_list).strip()

    date = response.xpath('//div[@class="ldDate"]/text()')[0]
    date = date.split(':')[1]
    # log('Content', content)
    return date, content

def get_content(self, url):
    '''
    Request a single news detail page
    :param url:
    :return: page text, or a sentinel string on failure
    '''
    t_sleep()
    log('Currently visiting URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('Request failed')
        print(e)
        self.__class__.retry = 1
        return 'timeout'

    if html.status_code != 200:
        log('Error visiting URL!!!', url)
        return 'error'

    response = etree.HTML(html.text)
    con_list = response.xpath(
        '//div[@class="union"]/descendant-or-self::*/text()')
    return ''.join(con_list).strip()

def get_newsinfo(self, urls):
    '''
    Visit each news detail page and insert the parsed item into MongoDB
    :param urls: collection of news links
    :return:
    '''
    for url in urls:
        t_sleep()
        log('Currently visiting URL', url)
        try:
            html = requests.get(url, timeout=3)
            html.encoding = 'utf-8'
        except Exception as e:
            log_line('Request failed')
            print(e)
            self.__class__.retry = 1
            continue

        if html.status_code != 200:
            continue

        response = etree.HTML(html.text)
        item = self.parse_item(response, html.url)
        MogoMgr().insert(item)

def get_content(self, url):
    '''
    Request a single news detail page
    :param url:
    :return: parsed item, or a sentinel string on failure
    '''
    t_sleep()
    log('Currently visiting URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('Request failed')
        print(e)
        self.__class__.retry = 1
        return 'timeout'

    if html.status_code != 200:
        return 'error'

    response = etree.HTML(html.text)
    return self.parse_item(response)

def get_iteminfo(self, url):
    '''
    Visit a single news detail page
    :param url: news detail link
    :return: News model, or a sentinel string on failure
    '''
    t_sleep()
    log('Currently visiting URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('Request failed')
        print(e)
        return 'timeout'

    if html.status_code != 200:
        log('Error visiting URL!!!', url)
        return 'error'

    response = etree.HTML(html.text)
    title, date, content = self.parse_item(response)
    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'xinhua'
    return news

def get_newsinfo(self, url):
    '''
    Request a single news detail page
    :param url:
    :return: News model, or a sentinel string on failure
    '''
    t_sleep()
    log('Currently visiting URL', url)
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'utf-8'
    except Exception as e:
        log_line('Request failed')
        print(e)
        self.__class__.retry = 1
        return 'timeout'

    if html.status_code != 200:
        log_line('Response status is not 200')
        return 'error'

    response = etree.HTML(html.text)
    title, date, content = self.parse_item(response)
    news = News(title=title, date=date, content=content, url=url)
    news.spider_name = 'bjjrj'
    return news

@classmethod
def re_send(cls):
    # Rerun the spider once if any request failed and no retry pass has started yet
    if cls.retry != -1 and cls.retry_flag == -1:
        log_line('Some news requests failed, running the spider again')
        log('Spider class being rerun', cls)
        cls.retry_flag = 1
        cls().run()

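# re_send() depends on two class-level flags that every spider class is
# assumed to define. A minimal sketch of that assumed layout (the attribute
# names come from re_send() and the request helpers above; the defaults and
# the base-class shape are assumptions, not the project's actual code):

class BaseSpiderSketch:
    retry = -1       # set to 1 by a request helper when any request fails
    retry_flag = -1  # set to 1 once a retry pass starts, so the spider reruns at most once
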
def get_newsinfo(self, url):
    '''
    Request a single news detail page
    :param url:
    :return: News model, or a sentinel string on failure
    '''
    t_sleep()
    try:
        html = requests.get(url, headers=self.get_news_header(), timeout=3)
        html.encoding = 'gbk'
    except Exception as e:
        log_line('Request failed')
        print(e)
        return 'timeout'

    response = etree.HTML(html.text)
    log('Currently visiting URL', url)
    title, date, content = self.parse_item(response)
    news = News(title=title, date=date, content=content, url=url)
    return news

def get_itemlist(self, page='1'):
    '''
    Fetch the Xinhua Finance news details for one list page
    :return: list of News models
    '''
    # Xinhua Finance - news list (JSON API; the payload is wrapped in one
    # extra character on each side, so strip the first and last before parsing)
    url = 'http://qc.wa.news.cn/nodeart/list?nid=11147664&pgnum={0}&cnt=16&tp=1&orderby=1'.format(page)
    html = requests.get(url, headers=self.get_newlist_header())
    items = json.loads(html.text[1:-1])
    items = items['data']['list']

    news_list = []
    for item in items:
        # Avoid duplicate requests
        find_one = self.mgr.find_one('url', item['LinkUrl'])
        if find_one is not None:
            log_line('URL already exists, skipping request')
            log(item['LinkUrl'])
            continue
        news = self.get_iteminfo(item['LinkUrl'])
        if news == 'timeout' or news == 'error':
            continue
        news_list.append(news)
    return news_list

def run(self):
    log_line('ZqrbSpider starting!!!')
    url = self.get_start_url()
    urls = self.get_html(url)
    news_list = self.send_request(urls)
    for news in news_list:
        self.mgr.insert(news)
    self.__class__().re_send()

def run(self):
    log_line('StcnSpider starting!!!')
    url = 'http://www.stcn.com/'
    urls = self.get_html(url)
    news_list = self.send_request(urls)
    for news in news_list:
        self.mgr.insert(news)
    self.__class__().re_send()

def run(self):
    log_line('ShangHaiSpider starting!!!')
    url = 'http://www.shanghai.gov.cn/nw2/nw2314/nw2319/nw41893/index.html'
    urls = self.get_html(url)
    news_list = self.send_request(urls)
    for news in news_list:
        self.mgr.insert(news)
    self.__class__().re_send()

def send_request(self, urls):
    news_list = []
    for url in urls:
        # Avoid duplicate requests
        find_one = self.mgr.find_one('url', url)
        if find_one is not None:
            log_line('URL already exists, skipping request')
            log(url)
            continue
        news = self.get_newsinfo(url)
        # get_newsinfo() returns sentinel strings on failure (see the other
        # spiders in this section); skip them instead of collecting them
        if news == 'timeout' or news == 'error':
            continue
        news_list.append(news)
    return news_list

def send_request(self, urls):
    for url in urls:
        # Avoid duplicate requests
        find_one = self.mgr.find_one('url', url)
        if find_one is not None:
            log_line('URL already exists, skipping request')
            log(url)
            continue
        date, content = self.parser_data(url)
        if content in ('error', 'timeout'):
            continue
        self.update_news(url, content, date)

def send_request(self, urls):
    for url in urls:
        # Avoid duplicate requests
        find_one = self.mgr.find_one('url', url)
        if find_one is not None:
            log_line('URL already exists, skipping request')
            log(url)
            continue
        content = self.get_content(url)
        if content == 'error' or content == 'timeout':
            continue
        self.update_content(url, content)

def run(self):
    log_line('CbrcSpider starting!!!')
    urls = self.get_html(self.start_url)
    self.send_request(urls)
    for news in self.newslist:
        find_one = self.mgr.find_one('url', news.url)
        if find_one is not None:
            log_line('URL already exists, skipping insert')
            log(news.url)
            continue
        self.mgr.insert(news)
    self.__class__().re_send()

def run(self):
    log_line('CsSpider starting!!!')
    start_urls = [
        'http://www.cs.com.cn/',
    ]
    for url in start_urls:
        urls = self.get_html(url)
        news_list = self.send_request(urls)
        for news in news_list:
            self.mgr.insert(news)
    self.__class__().re_send()

def send_request(self, urls):
    for url in urls:
        # Avoid duplicate requests
        find_one = self.mgr.find_one('url', url)
        if find_one is not None:
            log_line('URL already exists, skipping request')
            log(url)
            continue
        content = self.get_content(url)
        # Skip pages that failed to download
        if content in ('timeout', 'error'):
            continue
        for news in self.newslist:
            if news.url == url:
                news.content = content

def run(self):
    log_line('CctvSpider starting!!!')
    urls = []
    url = 'http://jingji.cctv.com/'
    urls_1 = self.get_html(url)
    urls_2 = self.get_jsondata()
    urls.extend(urls_1)
    urls.extend(urls_2)
    urls = set(urls)  # de-duplicate links collected from the two sources
    news_list = self.send_request(urls)
    for news in news_list:
        self.mgr.insert(news)
    self.__class__().re_send()

def run(self):
    log_line('XinHuaSpider starting!!!')
    news_list = []
    # Crawl the first two pages of the finance list, plus the carousel and money sections
    news_list_1 = self.get_itemlist(page='1')
    news_list_2 = self.get_itemlist(page='2')
    news_list_3 = self.get_lunbo()
    news_list_4 = self.get_money()
    news_list.extend(news_list_1)
    news_list.extend(news_list_2)
    news_list.extend(news_list_3)
    news_list.extend(news_list_4)
    for news in news_list:
        self.mgr.insert(news)
    self.__class__().re_send()

def run(self):
    log_line('BjjrjSpider starting!!!')
    urls = []
    url = 'http://www.bjjrj.gov.cn/zcfg/c19-list-1.html'
    urls_1 = self.get_html(url)
    url = 'http://www.bjjrj.gov.cn/zyzc/c138-list-1.html'
    urls_2 = self.get_html(url)
    urls.extend(urls_1)
    urls.extend(urls_2)
    news_list = self.send_request(urls)
    log_line(len(news_list))
    for news in news_list:
        self.mgr.insert(news)
    self.__class__().re_send()

def run(self):
    log_line('CircSpider starting!!!')
    for url in self.start_urls:
        self.get_html(url)
    self.send_request(self.get_newsUrls())

    # for news in self.newslist:
    #     log(news.url, news.content)

    for news in self.newslist:
        find_one = self.mgr.find_one('url', news.url)
        if find_one is not None:
            log_line('URL already exists, skipping insert')
            log(news.url)
            continue
        self.mgr.insert(news)
    self.__class__().re_send()

def send_request(self, urls):
    news_list = []
    for url in urls:
        # Avoid duplicate requests
        find_one = self.mgr.find_one('url', url)
        if find_one is not None:
            log_line('URL already exists, skipping request')
            log(url)
            continue
        news = self.get_newsinfo(url)
        if news == 'error':
            log('News page not found, moving on to the next URL')
            continue
        if news == 'timeout':
            log('News request timed out, skipping for now')
            continue
        news_list.append(news)
    return news_list

def get_money(self):
    '''
    Money section
    :return:
    '''
    url = 'http://www.xinhuanet.com/money/index.htm'
    html = requests.get(url, headers=self.get_news_header())
    html.encoding = 'utf-8'
    html = etree.HTML(html.text)

    urls_all = []
    urls_1 = html.xpath('//li[@class="clearfix"]/h3/a/@href')  # only the news lists are processed
    urls_2 = html.xpath('//li[@class="imp"]/a/@href')
    urls_3 = html.xpath('//div[@class="swiper-slide"]/a/@href')
    urls_all.extend(urls_1)
    urls_all.extend(urls_2)
    urls_all.extend(urls_3)
    # log(len(urls_all), urls_all)

    news_list = []
    for url in urls_all:
        # Avoid duplicate requests
        find_one = self.mgr.find_one('url', url)
        if find_one is not None:
            log_line('URL already exists, skipping request')
            log(url)
            continue
        news = self.get_iteminfo(url)
        if news == 'timeout' or news == 'error':
            continue
        news_list.append(news)
    return news_list

def run(self):
    log_line('PbcSpider starting!!!')
    # Announcements
    dest_url = 'http://www.pbc.gov.cn/rmyh/105208/index.html'
    self.send(dest_url, self.parser_gonggao_list, self.parse_gonggao_item)

    # Laws and regulations
    dest_url = 'http://www.pbc.gov.cn/tiaofasi/144941/index.html'
    self.send(dest_url, self.parser_falvfagui, self.parser_common_item)

    # Monetary policy (not yet implemented)
    # dest_url = 'http://www.pbc.gov.cn/rmyh/105145/index.html'
    # self.send(dest_url, self.parser_xindai, self.parser_common_item)

    # Credit policy
    dest_url = 'http://www.pbc.gov.cn/jinrongshichangsi/147160/147289/index.html'
    self.send(dest_url, self.parser_xindai, self.parser_common_item)

    self.__class__().re_send()

def send_request(self, urls, parser_item_fuc):
    '''
    Request each concrete news link
    :param urls: concrete news URLs
    :param parser_item_fuc: function that parses a single news detail page
    :return: list of parsed News objects
    '''
    news_list = []
    for url in urls:
        # Avoid duplicate requests
        find_one = self.mgr.find_one('url', url)
        if find_one is not None:
            log_line('URL already exists, skipping request')
            log(url)
            continue
        news = self.get_newsinfo(url, parser_item_fuc)
        if news == 'error' or news == 'timeout':
            continue
        news_list.append(news)
    return news_list

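# PbcSpider.run() above dispatches through a send() helper that is not shown
# in this section. A hypothetical sketch of its shape, inferred from the call
# sites in run() and from send_request() above; the real method may differ:

def send(self, dest_url, parser_list_fuc, parser_item_fuc):
    response = self.get_html(dest_url)  # decodes the anti-bot challenge, see get_html() below
    if response == 'timeout':
        return
    urls = parser_list_fuc(response)    # extract detail-page URLs from the listing page
    news_list = self.send_request(urls, parser_item_fuc)
    for news in news_list:
        self.mgr.insert(news)
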
def run(self):
    log_line('FangChanSpider starting!!!')
    start_urls = [
        'http://www.fangchan.com/policy/28/',
        'http://www.fangchan.com/plus/nlist.php?tid=2&tags=%E5%8E%9F%E5%88%9B',
        'http://www.fangchan.com/plus/nlist.php?tid=2&column=%E5%AE%8F%E8%A7%82',
        'http://www.fangchan.com/news/6/',
        'http://www.fangchan.com/news/1/',
        'http://www.fangchan.com/news/9/',
        'http://www.fangchan.com/news/5/',
        'http://www.fangchan.com/news/7/',
        'http://www.fangchan.com/news/4/',
    ]
    for url in start_urls:
        urls = self.get_html(url)
        news_list = self.send_request(urls)
        for news in news_list:
            self.mgr.insert(news)
    self.__class__().re_send()

def get_html(self, dest_url):
    '''
    Decode PBC's JavaScript anti-bot challenge, then request the page again
    to obtain the original HTML
    :param dest_url: the PBC URL to visit
    :return: HTML source as a requests Response object, or 'timeout'
    '''
    r = requests.session()
    # dest_url = 'http://www.pbc.gov.cn/rmyh/105208/index.html'
    # dest_url = 'http://www.pbc.gov.cn/tiaofasi/144941/index.html'
    # dest_url = 'http://www.pbc.gov.cn/rmyh/105145/index.html'
    # dest_url = 'http://www.pbc.gov.cn/jinrongshichangsi/147160/147289/index.html'

    # Use a session to persist cookies. The first request sets cookies like
    # {'wzwsconfirm': 'ab3039756ba3ee041f7e68f634d28882', 'wzwsvtime': '1488938461'};
    # only combined with the cookies computed from the JS challenge below
    # does the request pass validation.
    content = r.get(dest_url).content

    # Extract the page's inline script. '.' with re.DOTALL matches any
    # character, and (?P<name>...) captures a named group:
    # https://docs.python.org/3/howto/regex.html#regex-howto
    # cheatsheet: https://github.com/tartley/python-regex-cheatsheet/blob/master/cheatsheet.rst
    re_script = re.search(r'<script type="text/javascript">(?P<script>.*)</script>',
                          content.decode('utf-8'), flags=re.DOTALL)
    script = re_script.group('script')
    script = script.replace('\r\n', '')  # strip \r\n and the like before beautifying for better results
    # Beautify (and partially parse) the JS code: https://github.com/beautify-web/js-beautify
    res = jsbeautifier.beautify(script)
    with open('x.js', 'w') as f:
        f.write(res)  # dump to a file for inspection and analysis

    jscode_list = res.split('function')
    var_ = jscode_list[0]
    var_list = var_.split('\n')
    template_js = var_list[3]  # taken by position; a regex would also work
    template_py = js2py.eval_js(template_js)

    # Inject all global variables into the first function as locals, then evaluate it
    function1_js = 'function' + jscode_list[1]
    position = function1_js.index('{') + 1
    function1_js = function1_js[:position] + var_ + function1_js[position:]
    function1_py = js2py.eval_js(function1_js)
    cookie1 = function1_py(str(template_py))  # result looks like 'NA=='
    # Save the first computed cookie
    cookies = {}
    cookies['wzwstemplate'] = cookie1

    # Do the same with the third function
    function3_js = 'function' + jscode_list[3]
    position = function3_js.index('{') + 1
    function3_js = function3_js[:position] + var_ + function3_js[position:]
    function3_py = js2py.eval_js(function3_js)
    middle_var = function3_py()  # a str, looks like 'WZWS_CONFIRM_PREFIX_LABEL4132209'
    cookie2 = function1_py(middle_var)
    cookies['wzwschallenge'] = cookie2
    # On document.cookie in the JS code, see
    # https://developer.mozilla.org/zh-CN/docs/Web/API/Document/cookie

    dynamicurl = js2py.eval_js(var_list[0])

    # Requesting the dynamic URL with the new cookies yields the content page we want
    r.cookies.update(cookies)
    # content = r.get(self.host_url + dynamicurl).content.decode('utf-8')
    try:
        content = r.get(self.host_url + dynamicurl, timeout=3)
        content.encoding = 'utf-8'
    except Exception as e:
        log_line('Request failed')
        print(e)
        self.__class__.retry = 1
        return 'timeout'

    return content

def run(self):
    log_line('JingJiSpider starting!!!')
    news_list = self.get_newslist()
    self.get_newsinfo(news_list)

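# Every request helper in this section calls t_sleep() before issuing a
# request. The helper itself is not shown here; a plausible sketch, assuming
# a short randomized delay (the 1-3 second range is a guess, not the
# project's actual value):

import random
import time

def t_sleep():
    # Sleep for a random short interval so consecutive requests
    # do not hammer the target servers
    time.sleep(random.uniform(1, 3))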