def crawler(catalog): result_list = [] param['catalog'] = catalog response = request_site_page(url, params=param) if response is None: logger.error('网页请求错误{}'.format(url, params=param)) soup = bs(response.content if response else '', 'lxml') page_count_text = soup.body.find(class_='pagediv') page_count = int( re.search(r'共 (\d+) 页', page_count_text.text if page_count_text else '').group(1)) logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): param['page'] = num + 1 try: response = request_site_page(url, params=param) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'html5lib') for x in soup.find('table').find_all('tr'): if x.td.div: continue anc_url = 'http://pub.gdepb.gov.cn/pub/pubcatalog/extranet_pub_document_view.jsp?docId=' anc_url += re.search(r'doView\(\'(\d+)\'\)', str(x)).group(1).strip() if db[collection_name].count_documents({'url': anc_url}) != 0: return info = { 'title': x.find('a').text.strip(), 'publishDate': x.find_all('td')[-2].text.replace('年', '-').replace( '月', '-').replace('日', '').strip(), 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def crawler(): result_list = [] response = request_site_page(url, methods='post', params=param, headers=header) if response is None: logger.error('网页请求错误{}'.format(url)) if response is None: return data = response.json() page_count = math.ceil(data['total'] / 20) logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): temp = copy.deepcopy(param) temp['pageNumber'] = num + 1 try: response = request_site_page(url, methods='post', params=temp, headers=header) if response is None: logger.error('网页请求错误{}'.format(url)) return response.encoding = 'utf8' for x in response.json()['rows']: anc_url = 'http://portal.lncredit.gov.cn/TestUsers/website/ln210000/wsreportingdoublepublicityquery/punishdetail?id=' anc_url += x['id'] publish_date = re.search(r'\d+\-\d+\-\d+', x['uploadtime']).group() if db[collection_name].count_documents({'url': anc_url}) != 0: return info = { 'title': x['punishname'].strip(), 'publishDate': publish_date, 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def crawler(url_format): result_list = list() response = request_site_page(url_format.format('')) if response is None: logger.error('网页请求错误{}'.format(url_format.format(''))) soup = bs(response.content if response else '', 'lxml') page_count_text = soup.body.find(class_='font_hui14') page_count = int( re.findall(r'countPage = (\d+)', page_count_text.text if page_count_text else '')[-1]) logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): url = url_format.format('_' + str(num) if num != 0 else '') try: response = request_site_page(url) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') info = dict() for x in soup.find_all(class_='font_hei15_1'): if x.a: anc_url = x.a.attrs['href'].strip() if not anc_url.startswith('http'): anc_url = urljoin(url_format.format(''), anc_url) if db[collection_name].count_documents({'url': anc_url }) != 0: return info = { 'title': x.a.text, 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } else: info['publishDate'] = x.text if url_format == url_format_list[1] and '举报案件' not in info[ 'title']: continue logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def crawler(url_format): result_list = [] response = request_site_page(url_format.format('')) if response is None: logger.error('网页请求错误{}'.format(url_format.format(''))) soup = bs(response.content if response else '', 'lxml') page_count_text = soup.body.find(class_='data-page') page_count = int( re.findall(r'createPageHTML\((\d+)', str(page_count_text) if page_count_text else '')[-1]) logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): url = url_format.format('_' + str(num) if num != 0 else '') try: response = request_site_page(url) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') for x in soup.body.find(class_='data-list').find_all('p'): anc_url = x.a.attrs['href'].strip() if not anc_url.startswith('http'): anc_url = urljoin(url_format.format(''), anc_url) if db[collection_name].count_documents({'url': anc_url}) != 0: return info = { 'title': x.a.text.strip(), 'publishDate': x.find_all('span')[-1].text.strip().replace('.', '-'), 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def crawler(): result_list = [] response = request_site_page(url_format.format('')) if response is None: logger.error('网页请求错误{}'.format(url_format.format(''))) page_count = math.ceil( int( re.findall(r'var m_nRecordCount = "(\d+)";', response.text if response else '')[-1]) / 20) logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): url = url_format.format('_' + str(num) if num != 0 else '') try: response = request_site_page(url) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') for x in soup.body.find_all(class_='row'): temp = x.find(class_='mc').a publish_date = str(x.find(class_='fbrq').text).replace( '年', '-').replace('月', '-').replace('日', '') anc_url = temp.attrs['href'].strip() if not anc_url.startswith('http'): anc_url = urljoin(url_format.format(''), anc_url) if db[collection_name].count_documents({'url': anc_url}) != 0: return if '违法' not in str(temp.text): continue info = { 'title': temp.text, 'publishDate': publish_date, 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def crawler(): result_list = [] response = request_site_page(url_format.format('1')) if response is None: logger.error('网页请求错误{}'.format(url_format.format('1'))) soup = bs(response.content if response else '', 'lxml') page_count_text = soup.find(class_='pdlist').div.find_all('a') if len(page_count_text) > 0: page_count_text = page_count_text[-1].attrs['href'] else: page_count_text = '_' page_count = page_count_text.split('_')[-1].strip('.htm') page_count = int(page_count) if len(page_count) > 0 else 0 logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): url = url_format.format(num + 1) try: response = request_site_page(url) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') for x in soup.find(class_='newslist').find_all('li'): anc_url = x.find(class_='list_hbwj2').a.attrs['href'].strip() if not anc_url.startswith('http'): anc_url = urljoin(url_format.format(''), anc_url) if db[collection_name].count_documents({'url': anc_url}) != 0: return info = { 'title': x.find(class_='list_hbwj2').a.attrs['title'].strip(), 'publishDate': x.find(class_='list_hbwj3').text, 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def crawler(): result_list = [] response = request_site_page(url_format.format('1')) if response is None: logger.error('网页请求错误{}'.format(url_format.format('1'))) soup = bs(response.content if response else '', 'lxml') page_count_text = soup.body.find(class_='page') pattern_result = re.findall(r'gotonum\((\d+), (\d+),', page_count_text.text if page_count_text else '') total_count, page_size = 0, 25 if pattern_result and len(pattern_result) == 1: total_count = int(pattern_result[0][0]) page_size = int(pattern_result[0][1]) page_count = math.ceil(total_count / page_size) logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): url = url_format.format(num + 1) try: response = request_site_page(url) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') for x in soup.find_all('tr'): if x.td: anc_url = x.find('a').attrs['href'].strip() if not anc_url.startswith('http'): anc_url = url_prefix + anc_url if db[collection_name].count_documents({'url': anc_url}) != 0: return info = { 'title': x.find('a').attrs['title'].strip(), 'publishDate': x.find(class_='date').text, 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def crawler(MenuID): result_list = [] param_temp = copy.deepcopy(param) param_temp['MenuID'] = MenuID response = request_site_page(url, params=param_temp) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') page_count_text = soup.body.find(class_='pages') pattern_result = re.findall( r'共<font color="red">(\d+)', str(page_count_text) if page_count_text else '') page_count = int(pattern_result[-1]) if len(pattern_result) > 0 else 1 logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): param_temp['page'] = num + 1 try: response = request_site_page(url, params=param_temp) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') for x in soup.find(class_='t_infos_c').find_all('li'): anc_url = x.a.attrs['href'].strip() if not anc_url.startswith('http'): anc_url = urljoin(url_prefix, anc_url) if db[collection_name].count_documents({'url': anc_url}) != 0: return info = { 'title': x.a.attrs['title'].strip(), 'publishDate': x.cite.text.strip(), 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def crawler(): result_list = [] response = request_site_page(url_format.format('')) if response is None: logger.error('网页请求错误{}'.format(url_format.format(''))) page_count = int( re.search(r'createPageHTML\(\d+,(\d+),', response.text if response else '').group(1)) logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): url = url_format.format('_' + str(num) if num != 0 else '') try: response = request_site_page(url) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') for i, x in enumerate(soup.find_all('tr')): if i == 0: continue anc_url = x.find('a').attrs['href'].strip() if not anc_url.startswith('http'): anc_url = urljoin(url_format.format(''), anc_url) if db[collection_name].count_documents({'url': anc_url}) != 0: return info = { 'title': x.find('a').attrs['title'].strip(), 'publishDate': x.find_all('td')[-1].text, 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } if '行政处罚' not in info['title'] and '行政复议' not in info[ 'title'] and '投诉' not in info['title']: continue logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def crawler(): result_list = [] response = request_site_page(url, params=param) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') page_count_text = soup.body.find(class_='fanyestyle1085').tr.td page_count = int(re.findall(r'\d+/(\d+)', str(page_count_text) if page_count_text else '')[-1]) logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): param['ainfolist1085p'] = num + 1 try: response = request_site_page(url, params=param) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') for i, x in enumerate(soup.body.find(class_='govinfolist1085').find_all('tr')): if i == 0: continue td_list = x.find_all('td') anc_url = td_list[1].a.attrs['href'].strip() if not anc_url.startswith('http'): anc_url = urljoin(url, anc_url) if db[collection_name].count_documents({'url': anc_url}) != 0: return info = { 'title': td_list[1].a.text.strip(), 'publishDate': td_list[2].text.strip().replace('年', '-').replace('月', '-').replace('日', ''), 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def neeq_crawler_two(start_time, end_time): result_list = [] # get page count r = request_site_page('http://www.neeq.com.cn/PunishmentController/infoResult.do', params={'disclosureType': 8, 'page': 0, 'startTime': start_time, 'endTime': end_time}, methods='post') if r is None: logger.error('网页请求错误') return result_text = re.sub(r'null\(', '', r.text) result_json = json.loads(''.join(list(result_text)[:-1])) page_count = result_json[0]['pageList']['totalPages'] logger.info('股转系统 纪律处分一共有%d页' % page_count) # get crawler data for num in range(page_count): try: logger.info('第%d页' % (num + 1)) r = request_site_page('http://www.neeq.com.cn/PunishmentController/infoResult.do', params={'disclosureType': 8, 'page': num, 'startTime': start_time, 'endTime': end_time}, methods='post') if r is None: logger.error('网页请求错误') continue result_text = re.sub(r'null\(', '', r.text) result_json = json.loads(''.join(list(result_text)[:-1])) for each_announcement in result_json[0]['pageList']['content']: announcement_url = each_announcement['destFilePath'] if 'http://www.neeq.com.cn' in each_announcement[ 'destFilePath'] else 'http://www.neeq.com.cn' + each_announcement['destFilePath'] if db.neeq_data.find({'url': announcement_url}).count() == 0: logger.info('股转系统 纪律处分新公告:' + announcement_url) result_list.append({ 'title': each_announcement['announcementTitle'], 'publishDate': str(each_announcement['announcementDate']['year'] + 1900) + '-' + str( each_announcement['announcementDate']['month'] + 1) + '-' + str( each_announcement['announcementDate']['day']), 'url': announcement_url, 'type': '纪律处分', 'origin': '股转系统', 'status': 'not parsed' }) else: if config['crawler_update_type']['update_type'] == '0': break except Exception as e: logger.error(e) continue return result_list
def crawler(channelId): result_list = [] url = 'http://www.sepb.gov.cn/hb/fa/cms/shhj/list_login.jsp?channelId=' + channelId response = request_site_page(url) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') page_count = int(soup.find(class_='scroll').find_all_next(class_='bold_nun')[1].text) logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): try: response = request_site_page(url=url, methods='post', data={'pageNo': num + 1}) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') for x in soup.find(class_='ascend_ul').find_all('a'): anc_url = x.attrs['href'].strip() if not anc_url.startswith('http'): anc_url = urljoin(url_prefix, anc_url) if db[collection_name].count_documents({'url': anc_url}) != 0: return publish_date = x.span.text x.span.decompose() info = { 'title': x.text, 'publishDate': publish_date, 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def crawler(): result_list = [] url = url_format.format('1') response = request_site_page(url) if response is None: logger.error('网页请求错误{}'.format(url)) page_count = 0 result = re.search(r'共(\d+)页', response.text if response else '') if result: page_count = int(result.group(1)) logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): try: response = request_site_page(url_format.format(num + 1)) if response is None: logger.error('网页请求错误{}'.format(url_format.format(num + 1))) soup = bs(response.content if response else '', 'lxml') for x in soup.find_all( attrs={"class": re.compile(r'^tr_main_value_')}): anc_url = x.find('a').attrs['href'].strip() if db[collection_name].count_documents({'url': anc_url}) != 0: return info = { 'title': x.find('a').attrs['title'].strip(), 'publishDate': x.find_all('td')[-1].text, 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def crawler(): result_list = [] response = request_site_page(url, methods='post', params=param, data=data) if response is None: logger.error('网页请求错误{}'.format(url)) page_count = 0 result = re.search(r'<totalpage>(\d+)', response.text if response else '') if result: page_count = int(result.group(1)) logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): try: param_temp = copy.deepcopy(param) param_temp['startrecord'] = num * 20 + 1 param_temp['endrecord'] = (num + 1) * 20 response = request_site_page(url, methods='post', params=param_temp, data=data) if response is None: logger.error('网页请求错误{}'.format(url)) for x in re.findall(r'href=\'(.*?)\' title=\'(.*?)\'.*?\[(.*?)\]', response.text if response else ''): anc_url = 'http://www.zjepb.gov.cn' + x[0].strip() if db[collection_name].count_documents({'url': anc_url}) != 0: return info = { 'title': x[1], 'publishDate': x[2], 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def get_list(): result_list = [] url = 'http://www.sepb.gov.cn/zhifa/law_enforce_list.jsp' response = request_site_page(url) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') page_count = int(soup.find(class_='scroll').find_all_next(class_='bold_nun')[1].text) logger.info('{} 一共有{}页'.format(gov_name, page_count)) for num in range(page_count): try: response = request_site_page(url=url, methods='post', data={'pageNo': num + 1}) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') for x in soup.find_all('a', class_='listInfo'): publish_date = re.search(r'\d+-\d+-\d+', x.attrs['onclick']).group() anc_url = 'http://www.sepb.gov.cn/zhifa/law_enforce_sublist.jsp?time=' + publish_date if db[collection_name].count_documents({'url': anc_url}) != 0: return info = { 'title': x.text, 'publishDate': publish_date, 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{}'.format(gov_name, info['title'])) if info not in result_list: result_list.append(info) except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def test( announcement_url='http://sthjt.shanxi.gov.cn/html/xzcfjd/20170227/54071.html' ): r = request_site_page(announcement_url) r.encoding = r.apparent_encoding content_soup = BeautifulSoup(r.text, 'lxml') if announcement_url == 'http://sthjt.shanxi.gov.cn/html/xzcfjd/20170614/54089.html' \ or announcement_url == 'http://sthjt.shanxi.gov.cn/html/xzcfjd/20170227/54071.html': content_text_list = content_soup.find( 'div', class_='td-con').find_all('tr')[2:] i = 0 while i < len(content_text_list): if ('季度' in content_text_list[i].text) or ( '企业名称' in content_text_list[i].text): del (content_text_list[i]) else: i = i + 1 result_map_list = [] for content_text in content_text_list: context = content_text.find_all('td') # 处罚机构 announcement_org = context[3].text # 处罚日期 # real_publish_date = format_date(each_document['publishDate'].split(' ')[0]) # 文号 announcement_code = context[4].text # 当事人 litigant = context[1].text # 违规事实 facts = '超标率: ' + context[2].text # 认定意见 punishment_basis = '' # 申辩意见 defenseOpinion = '' # 申辩意见反馈 defenseResponse = '' # 处罚决定 punishment_decision = context[5].text
def hainan_circ(db, logger): for each_circ_data in db.circ_data.find({'origin': '海南保监局', 'status': {'$nin': ['ignored']}}): announcement_url = each_circ_data['url'] announcement_title = each_circ_data['title'] if db.circ_data.find( {'url': announcement_url, 'status': 'parsed'}).count() == 1 and db.parsed_data.find( {'origin_url': announcement_url, 'oss_file_origin_url': announcement_url, 'parsed': True}).count() == 1: continue logger.info('海南保监局 ' + 'Url to parse: %s' % announcement_url) r = request_site_page(announcement_url) if r is None: logger.error('网页请求错误 %s' % announcement_url) continue content_soup = bs(r.content, 'lxml') if r else bs('', 'lxml') if db.parsed_data.find( {'origin_url': announcement_url, 'oss_file_origin_url': announcement_url}).count() == 0: oss_file_map = { 'origin_url': announcement_url, 'oss_file_origin_url': announcement_url, 'origin_url_id': each_circ_data['_id'], 'oss_file_type': 'html', 'oss_file_name': announcement_title, 'oss_file_content': r.text.encode(r.encoding).decode('utf-8'), 'parsed': False } insert_response = db.parsed_data.insert_one(oss_file_map) file_id = insert_response.inserted_id oss_add_file(ali_bucket, str(file_id) + '/' + announcement_title + '.html', r.text.encode(r.encoding).decode('utf-8')) db.circ_data.update_one({'_id': each_circ_data['_id']}, {'$set': {'status': 'parsed'}}) else: db.circ_data.update_one({'_id': each_circ_data['_id']}, {'$set': {'status': 'parsed'}}) file_id = db.parsed_data.find_one({'origin_url': announcement_url, 'oss_file_origin_url': announcement_url})['_id'] table_content = content_soup.find(id='tab_content') if not table_content: logger.error('网页请求错误 %s' % announcement_url) continue content_text = get_content_text(table_content.find_all('tr')[3]) if content_text == '': continue title = table_content.find_all('tr')[0].text.strip() document_code_compiler = re.compile(r'((琼银?保监罚(字)?|琼银保监筹).\d{4}.\d+号)') if document_code_compiler.search(content_text): document_code = document_code_compiler.search(content_text).group(1).strip() litigant_compiler = re.compile( document_code.replace(r'[', r'\[').replace(r']', r'\]') + r'\n([\s\S]*?)\n' + r'(经查|经检查|依据.*?的有关规定|' r'抽查|经抽查|.*?现场检查|现查明|' r'你公司于2007年承保的保单号为PMHU07013849|' r'你公司海口市府城镇中山路)') litigant = litigant_compiler.search(content_text).group(1).strip(). 
\ replace('中国保监会海南监管局行政处罚决定书', '').strip() else: if document_code_compiler.search(title): document_code = document_code_compiler.search(title).group(1).strip() else: document_code = '' litigant_compiler = re.compile(r'^([\s\S]*?)\n' + r'(经查|经检查|依据.*?的有关规定|抽查|经抽查|.*?现场检查|现查明)') litigant = litigant_compiler.search(content_text).group(1).strip() truth_text_str = r'((经查|检查发现)' \ r'([\s\S]*?))' \ r'((我局认为,)?(上述|以上).*?(事实|行为|事实)(,|,)?有.*?等(证据)?(在案)?证明(,|,|。)(足以认定。)?|' \ r'(我局认为,|综上,)?(上述|以上|你的).*?(行为|问题|事实).*?违反.*?第.*条.*?(的规定)?|' \ r'(根据|依据).*?第.*?条|依据《保险法》规定)' truth_compiler = re.compile(truth_text_str) if truth_compiler.search(content_text): truth = truth_compiler.search(content_text).group(1).strip() else: truth_text_str = litigant + r'([\s\S]*?)' \ r'((我局认为,)?(上述|以上).*?(事实|行为|事实)(,|,)?有.*?等证据(在案)?证明(,|,|。)(足以认定。)?|' \ r'(我局认为,|综上,)?(上述|以上).*?(行为|问题|事实).*?违反.*?第.*条.*?(的规定)?|' \ r'(根据|依据).*?第.*?条|依据《保险法》规定)' truth_compiler = re.compile(truth_text_str) truth = truth_compiler.search(content_text).group(1).strip() if '申辩' in content_text: defense_text_str = r'((针对.*?行为.*?申辩意见|(当事人)?[^,。,;\n]*?(未)?提出(了)?陈述申辩(意见)?|' \ r'[^,。,;\n]*?向我局(报送|递交|提出)[^,。,;\n]*?|本案在审理过程中.*?提出陈述申辩|' \ r'[^,。,;\n]*?在(申辩材料|陈述申辩|陈述申辩意见|申辩意见)中称|[^,。,;\n]*?在听证阶段提出|' \ r'[^,。,;\n]*?在法定期限内(未)?提出(了)?(听证要求|陈述申辩|陈述申辩及听证要求))' \ r'([\s\S]*?))' \ r'(因此,我局决定|' \ r'我局经复核(认为|决定)|' \ r'本案现已审理终结|' \ r'我局经复查[^,。,;\n]*?情况|' \ r'我局[^,。,;\n]*?认真复核|' \ r'经研究,对[^,。,;\n]*?予以采纳。|' \ r'我局认为.*?申辩理由|' \ r'依据.*?我局认为.*?的申辩理由|' \ r'经研究,我局认为.*?申辩意见|' \ r'经我局审核,决定|' \ r'我局认为,上述违法行为事实清楚、证据确凿、法律法规适当|' \ r'我局对陈述申辩意见进行了复核|' \ r'经我局审核|' \ r'针对[^,。,;\n]*?的(陈述)?申辩意见,我局进行了核实|' \ r'经查,我局认为|' \ r'依据现场检查及听证情况)' defense_compiler = re.compile(defense_text_str, re.MULTILINE) defense_list = defense_compiler.findall(content_text) if len(defense_list) != 0: defense = defense_list[-1][0].strip() defense_response_str = defense.replace(r'[', r'\[').replace(r']', r'\]') \ + r'(([\s\S]*?)' + r'(本案现已审理终结。|不符合.*?情形。|根据.*?依法可从轻或者减轻行政处罚。|' \ r'对[^,。,;\n]*?申辩意见(不予|予以)采纳|因此.*?申辩理由.*?成立。|' \ r'我局认为.*?申辩(理由|意见).*?符合.*?第.*?条.*?的条件.(予以采纳。)?))' defense_response_compiler = re.compile(defense_response_str, re.MULTILINE) if defense_response_compiler.search(content_text): defense_response = defense_response_compiler.search(content_text).group(1).strip() else: if '未' in defense: defense_response = '' else: defense_text_str = '([^。;\n]*?向.*?公告送达了《行政处罚事先告知书》.*?提出陈述申辩。|' \ '我局依法于2012年5月25日对你公司送达了《行政处罚事先告知书》,你公司在规定的时间内未提出陈述和申辩意见,也未要求举行听证。)' defense_compiler = re.compile(defense_text_str, re.MULTILINE) defense = defense_compiler.search(content_text).group(1).strip() defense_response = '' else: defense = defense_response = '' punishment_decision_text_str = r'(((依据|根据).*?第?.*?条.*?(规定)?.?(我局)?(决定|责令|给予|于.*?向.*?发出|对.*?作出|拟对你|对你)|' \ r'我局决定.*?作出(如下|以下)(行政)?处罚:|我局决定,依据|' \ r'你的上述行为违反了《中华人民共和国保险法》第一百七十三条规定,决定|' \ r'依据《保险法》规定,我局决定)' \ r'([\s\S]*?))' \ r'(请在本处罚决定书送达之日|当事人应当在接到本处罚决定书之日|如不服本处罚决定|' \ r'请(在)?接到本处罚决定书之日|如不服从本处罚决定|本处罚决定自送达之日|' \ r'请在接到本处罚决定书之日)' punishment_decision_compiler = re.compile(punishment_decision_text_str) punishment_decision = punishment_decision_compiler.search(content_text).group(1).strip() punishment_basis_str_list = [ r'([^\n。;]*?)(问题|行为|事项|情况|事实)([^\n。;\s]*?)违反.*?\n?.*?第.*?条?.*?\n?.*?((的|之|等)(相关)?规定)?', r'你公司于2007年承保的保单号为PMHU07013849、06020200010008062007000175、06020200030008062007000082等交强险业务,' r'没有执行统一的基础保险费率,违反了《机动车交通事故责任强制保险条例》第六条的规定', r'你公司自2007年开业至2008年2月底,没有按照规定开设独立的客户资金专用账户,而是将在农业银行海口市金盘工业区分理处开设的基本存款账户' 
r'(账号:21164001040002867)用于代收客户资金及与保险机构之间的资金结算,还用于支付公司税金、员工的社会保险费用等,违反了《保险经纪机构管理规定》第八十八条规定', r'经查,中国人民人寿保险股份有限公司海南省分公司2008年1-8月承保团体年金B款业务,保费收入62193335元,' r'以赠送保险方式改变了产品条款中的退保、给付条件及金额且未向我局报备,违反了《保险法》第一百零七条第二款规定。' r'\n你作为中国人民人寿保险股份有限公司海南省分公司副总经理,分管团险业务,对该违规行为负有直接责任' ] punishment_basis_str = '|'.join(punishment_basis_str_list) punishment_basis_compiler = re.compile('[。\n;]' + '(' + punishment_basis_str + ')' + '.(\n?(我局决定,|我局)?依据|\n?根据|\n?鉴于|\n?上述违法事实)', re.MULTILINE) punishment_basis_list = punishment_basis_compiler.findall(content_text) punishment_basis = ';'.join([kk[0].strip() for kk in punishment_basis_list]) publish_date_text = re.search( punishment_decision.replace(r'(', r'\(').replace(r')', r'\)').replace(r'[', r'\[').replace(r']', r'\]'). replace(r'*', r'\*') + r'([\s\S]*?)$', content_text).group(1).replace('\n', '') if re.search(r'.{4}年.{1,2}月.{1,3}日', publish_date_text): publish_date = re.findall('.{4}年.{1,2}月.{1,3}日', publish_date_text)[-1].replace(' ', '') m = re.match("([0-9零一二两三四五六七八九十〇○OOΟО]+年)?([0-9一二两三四五六七八九十]+)月?([0-9一二两三四五六七八九十]+)[号日]?", publish_date) real_publish_date = get_year(m.group(1)) + str(cn2dig(m.group(2))) + '月' + str(cn2dig(m.group(3))) + '日' else: publish_date_text = table_content.find_all('tr')[1].text publish_date = re.findall(r'\d{4}-\d{1,2}-\d{1,2}', publish_date_text)[-1] real_publish_date = publish_date.split('-')[0] + '年' + str(int(publish_date.split('-')[1])) + '月' + str( int(publish_date.split('-')[2])) + '日' result_map = { 'announcementTitle': title, 'announcementOrg': '海南银保监局', 'announcementDate': real_publish_date, 'announcementCode': document_code, 'facts': truth, 'defenseOpinion': defense, 'defenseResponse': defense_response, 'litigant': litigant[:-1] if litigant[-1] == ':' else litigant, 'punishmentBasement': punishment_basis, 'punishmentDecision': punishment_decision, 'type': '行政处罚决定', 'oss_file_id': file_id, 'status': 'not checked' } logger.info(result_map) if db.announcement.find({'announcementTitle': title, 'oss_file_id': file_id}).count() == 0: db.announcement.insert_one(result_map) logger.info('海南保监局 数据解析 ' + ' -- 数据导入完成') else: logger.info('海南保监局 数据解析 ' + ' -- 数据已经存在') db.parsed_data.update_one({'_id': file_id}, {'$set': {'parsed': True}}) logger.info('海南保监局 数据解析 ' + ' -- 修改parsed完成')
def crawler(): result_list = [] url = first_url.format('') response = request_site_page(url) response.encoding = response.apparent_encoding stop_flag = False if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') if db.crawler.find({'url': first_url}).count() > 0: last_updated_url = db.crawler.find_one({'url': first_url})['last_updated'] else: last_updated_url = '' page_num = 1 while response.status_code != 404: try: data_list = soup.find(attrs={"class": "hotnews_list"}).find_all('li') for index, each_data in enumerate(data_list): title = each_data.find('a').text.strip() if re.search('(双公示|行政处罚)', title): href = each_data.find('a')['href'] anc_url = urljoin(url, href) if anc_url == last_updated_url: stop_flag = True logger.info('到达上次爬取的链接') break if index == 0 and page_num == 1: if db.crawler.find({'url': first_url}).count() > 0: if db.crawler.find_one({'url': first_url})['last_updated'] != anc_url: db.crawler.update_one({'url': first_url}, {'$set': {'last_updated': anc_url}}) else: db.crawler.insert_one( {'url': first_url, 'last_updated': anc_url, 'origin': gov_name}) publish_date = each_data.find('span').text.replace('/', '') if db[collection_name].count_documents({'url': anc_url}) == 0: info = { 'title': title, 'publishDate': publish_date, 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{} url: {}'.format(gov_name, info['title'], anc_url)) if info not in result_list: result_list.append(info) else: if config['crawler_update_type']['update_type'] == '0': break if stop_flag: logger.info('到达上次爬取的链接') break url = first_url.format('_' + str(page_num)) page_num += 1 response = request_site_page(url) response.encoding = response.apparent_encoding soup = bs(response.content if response else '', 'lxml') except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue page_num = 1 url = second_url.format(str(page_num)) response = request_site_page(url) response.encoding = response.apparent_encoding stop_flag = False if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') if db.crawler.find({'url': second_url}).count() > 0: last_updated_url = db.crawler.find_one({'url': second_url})['last_updated'] else: last_updated_url = '' if not re.search(r'暂无符合公布标准的重大税收违法案件信息。', soup.text): page_count_text = soup.find_all('a')[-1]['href'] page_count = re.search('page=(\d+)', page_count_text).group(1).strip() while page_num <= int(page_count): try: data_list = soup.find(attrs={"class": "lb"}).find_all('dd') for index, each_data in enumerate(data_list): href = each_data.find('a')['href'] anc_url = urljoin(url, href) if anc_url == last_updated_url: stop_flag = True logger.info('到达上次爬取的链接') break if index == 0 and page_num == 1: if db.crawler.find({'url': second_url}).count() > 0: if db.crawler.find_one({'url': second_url})['last_updated'] != anc_url: db.crawler.update_one({'url': second_url}, {'$set': {'last_updated': anc_url}}) else: db.crawler.insert_one( {'url': second_url, 'last_updated': anc_url, 'origin': gov_name}) title = each_data.find('a').text.strip() if db[collection_name].count_documents({'url': anc_url}) == 0: info = { 'title': title, 'publishDate': '', 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{} url: {}'.format(gov_name, info['title'], anc_url)) if info not in result_list: result_list.append(info) else: if config['crawler_update_type']['update_type'] == '0': break 
if stop_flag: logger.info('到达上次爬取的链接') break page_num += 1 url = second_url.format(str(page_num)) response = request_site_page(url) response.encoding = response.apparent_encoding soup = bs(response.content if response else '', 'lxml') except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
def scczt_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [{ 'url': 'http://www.sccz.gov.cn/new_web/new_NewList.jsp?ntypename=%25E8%25A1%258C%25E6%2594%25BF%25E8%25AE%25B8%25E5%258F%25AF%25E8%25A1%258C%25E6%2594%25BF%25E5%25A4%2584%25E7%25BD%259A', 'origin': '四川省财政厅', 'request_url': 'http://www.sccz.gov.cn/new_web/new_NewListRight.jsp?id=87&nYear=0' }] for each_url_info in prefix_url: each_url = each_url_info['request_url'] stop_flag = False logger.info('四川省财政厅 抓取URL:' + each_url) # get page count base_page = request_site_page(each_url) if base_page is None: logger.error('网页请求错误 %s' % each_url_info['url']) continue base_page.encoding = base_page.apparent_encoding base_soup = BeautifulSoup(base_page.text, 'lxml') try: page_count_text = base_soup.find(id='page').text.strip() page_count = int(re.findall(r'\d+', page_count_text)[2]) except Exception as e: logger.warning(e) page_count = 0 logger.info('一共有%d页' % page_count) if db.crawler.find({'url': each_url}).count() > 0: last_updated_url = db.crawler.find_one({'url': each_url})['last_updated'] else: last_updated_url = '' # get crawler data for page_num in range(page_count): logger.info('第' + str(page_num + 1) + '页') url = each_url + '&page=' + str(page_num + 1) try: page_response = request_site_page(url) if page_response is None: logger.error('网页请求错误 %s' % url) continue page_response.encoding = page_response.apparent_encoding page_soup = BeautifulSoup(page_response.text, 'lxml') all_li = page_soup.find('table').find_all('tr') for index, each_result in enumerate(all_li): title = each_result.find('span').attrs['title'].strip() announcement_id = str( re.findall( r'\d+', each_result.find('td').attrs['onclick'].strip()) [0]) true_url = 'http://www.sccz.gov.cn/new_web/new_NewShow.jsp?id=' + announcement_id # 判断是否为之前抓取过的 if true_url == last_updated_url: stop_flag = True logger.info('到达上次爬取的链接') break # 更新抓取的分割线 if page_num == 0 and index == 0: if db.crawler.find({'url': each_url}).count() > 0: if db.crawler.find_one( {'url': each_url})['last_updated'] != true_url: db.crawler.update_one( {'url': each_url}, {'$set': { 'last_updated': true_url }}) else: db.crawler.insert_one({ 'url': each_url, 'last_updated': true_url, 'origin': each_url_info['origin'] }) if re.search('.*(行政处罚).*', title): publish_date = each_result.find( class_='Content_r').text.strip() if db.finance_data.find({ 'url': true_url }).count() == 0: logger.info('四川省财政厅新公告:' + true_url + ' title: ' + title) post = { 'title': title, 'publishDate': publish_date, 'url': true_url, 'type': '行政处罚决定', 'origin': '四川省财政厅', 'status': 'not parsed' } if post not in result_list: result_list.append(post) else: if config['crawler_update_type'][ 'update_type'] == '0': break if stop_flag: logger.info('到达上次爬取的链接') break except Exception as e: logger.error(e) continue if len(result_list) > 0: logger.info('四川省财政厅一共有%d条新公告,导入数据库中......' % len(result_list)) r = db.finance_data.insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('四川省财政厅公告导入完成!') else: logger.error('四川省财政厅公告导入出现问题!') else: logger.info('四川省财政厅没有新公告!')
def guizhou_circ(db, logger): for each_circ_data in db.circ_data.find({'origin': '贵州保监局', 'status': {'$nin': ['ignored']}}): announcement_url = each_circ_data['url'] announcement_title = each_circ_data['title'] if db.circ_data.find( {'url': announcement_url, 'status': 'parsed'}).count() == 1 and db.parsed_data.find( {'origin_url': announcement_url, 'oss_file_origin_url': announcement_url, 'parsed': True}).count() == 1: continue logger.info('贵州保监局 ' + 'Url to parse: %s' % announcement_url) r = request_site_page(announcement_url) if r is None: logger.error('网页请求错误 %s' % announcement_url) continue content_soup = bs(r.content, 'lxml') if r else bs('', 'lxml') if db.parsed_data.find( {'origin_url': announcement_url, 'oss_file_origin_url': announcement_url}).count() == 0: oss_file_map = { 'origin_url': announcement_url, 'oss_file_origin_url': announcement_url, 'origin_url_id': each_circ_data['_id'], 'oss_file_type': 'html', 'oss_file_name': announcement_title, 'oss_file_content': r.text.encode(r.encoding).decode('utf-8'), 'parsed': False } insert_response = db.parsed_data.insert_one(oss_file_map) file_id = insert_response.inserted_id oss_add_file(ali_bucket, str(file_id) + '/' + announcement_title + '.html', r.text.encode(r.encoding).decode('utf-8')) db.circ_data.update_one({'_id': each_circ_data['_id']}, {'$set': {'status': 'parsed'}}) else: db.circ_data.update_one({'_id': each_circ_data['_id']}, {'$set': {'status': 'parsed'}}) file_id = db.parsed_data.find_one({'origin_url': announcement_url, 'oss_file_origin_url': announcement_url})['_id'] table_content = content_soup.find(id='tab_content') if not table_content: logger.error('网页请求错误 %s' % announcement_url) continue content_text = get_content_text(table_content.find_all('tr')[3]) if content_text == '': continue title = table_content.find_all('tr')[0].text.strip() if '行政处罚事项' in title: sub_table_content = table_content.find_all('tr')[3].find_all('table')[0] result_map_list = [] for tr in sub_table_content.find_all('tr'): if '行政处罚公开信息' not in tr.text and '行政处罚事项' not in tr.text and '处罚决定文号' not in tr.text and \ len(tr.find_all('td')) > 1 and tr.find_all('td')[0].text != tr.find_all('td')[1].text: real_title = '贵州银保监局行政处罚决定书(' + tr.find_all('td')[0].text + ')' real_publish_date = tr.find_all('td')[2].text.split('-')[0] + '年' + \ tr.find_all('td')[2].text.split('-')[1] + '月' + \ tr.find_all('td')[2].text.split('-')[2] + '日' result_map = { 'announcementTitle': real_title, 'announcementOrg': '贵州银保监局', 'announcementDate': real_publish_date, 'announcementCode': tr.find_all('td')[0].text, 'facts': tr.find_all('td')[1].text + tr.find_all('td')[6].text, 'defenseOpinion': '', 'defenseResponse': '', 'litigant': tr.find_all('td')[1].text, 'punishmentBasement': tr.find_all('td')[1].text + '上述行为违反了' + tr.find_all('td')[4].text, 'punishmentDecision': '依据' + tr.find_all('td')[5].text + ',' + '我局对' + tr.find_all('td')[ 1].text + '作出以下处罚:' + tr.find_all('td')[3].text, 'type': '行政处罚决定', 'oss_file_id': file_id, 'status': 'not checked' } logger.info(result_map) if db.announcement.find({'announcementTitle': real_title, 'oss_file_id': file_id}).count() == 0: db.announcement.insert_one(result_map) logger.info('贵州保监局 数据解析 ' + ' -- 数据导入完成') else: logger.info('贵州保监局 数据解析 ' + ' -- 数据已经存在') result_map_list.append(result_map) if len(result_map_list) > 0: db.parsed_data.update_one({'_id': file_id}, {'$set': {'parsed': True}}) logger.info('贵州保监局 数据解析 ' + ' -- 修改parsed完成') else: logger.info('贵州保监局 数据解析 ' + ' -- 没有数据') else: document_code_compiler = 
re.compile(r'(黔保监罚字.\d{4}.\d+.*?号)') if document_code_compiler.search(content_text): document_code = document_code_compiler.search(content_text).group(1).strip() litigant_compiler = re.compile( document_code.replace(r'[', r'\[').replace(r']', r'\]') + r'\n([\s\S]*?)\n' + r'(经查|经检查|依据.*?的有关规定|抽查|经抽查|' r'.*?存在.*?行为|.*进行检查|我局.*?检查时|' r'.*?经营活动|经核查|.*?进行了核查|' r'.*?担任.*?期间|' r'.*?未经我局批准,擅自|' r'你公司未经保险监管部门批准)') litigant = litigant_compiler.search(content_text).group(1).strip() else: if document_code_compiler.search(title): document_code = document_code_compiler.search(title).group(1).strip() litigant_compiler = re.compile(r'^([\s\S]*?)\n' + r'(经查|经检查|依据.*?的有关规定|抽查|经抽查|.*?存在.*?行为|' r'.*进行检查|我局.*?检查时|.*?担任.*?期间|你公司未经保险监管部门批准|' r'.*?经营活动|经核查|.*?进行了核查|.*?未经我局批准,擅自)') litigant = litigant_compiler.search(content_text).group(1).strip() else: document_code = '' litigant_compiler = re.compile(r'行政处罚信息.(.*?).$') litigant = litigant_compiler.search(title).group(1).strip() truth_text_str = r'((经查|二、|三、|经核查)' \ r'([\s\S]*?))' \ r'((我局认为,)?(上述|以上).*?(事实|行为|问题)(,|,)?有.*?等证据(在案)?证明(,|,|。)(足以认定。)?|' \ r'(我局认为,|综上,)?(上述|以上).*?(行为|问题|事实).*?违反.*?第.*条.*?(的规定)?|' \ r'违反|依据|上述行为违反了)' truth_compiler = re.compile(truth_text_str) truth_list = truth_compiler.findall(content_text) if len(truth_list) > 0: truth = '\n'.join([each_truth[0] for each_truth in truth_list]) else: truth_text_str = litigant.replace(r'*', r'\*') + \ r'([\s\S]*?)' \ r'((我局认为,)?(上述|以上).*?(事实|行为|事实)(,|,)?有.*?等证据(在案)?证明(,|,|。)(足以认定。)?|' \ r'(我局认为,|综上,)?(上述|以上).*?(行为|问题|事实).*?违反.*?第.*条.*?(的规定)?|' \ r'上述行为违反了|依据《中华人民共和国保险法》第一百零九条、第一百三十七条、第一百四十七条的规定|' \ r'依据《保险法》第一百四十四条的规定)' truth_compiler = re.compile(truth_text_str) truth = truth_compiler.search(content_text).group(1).strip() if '申辩' in content_text: defense_text_str = r'((针对.*?行为.*?申辩意见|(当事人)?[^,。,;\n]*?(未)?提出(了)?陈述申辩(意见)?|' \ r'[^,。,;\n]*?向我局(报送|递交|提出)[^,。,;\n]*?|本案在审理过程中.*?提出陈述申辩|' \ r'[^,。,;\n]*?在(申辩材料|陈述申辩|陈述申辩意见|申辩意见)中称|[^,。,;\n]*?在听证阶段提出|' \ r'[^,。,;\n]*?在法定期限内(未)?提出(了)?(听证要求|陈述申辩|陈述申辩及听证要求))' \ r'([\s\S]*?))' \ r'(因此,我局决定|' \ r'我局经复核(认为|决定)|' \ r'本案现已审理终结|' \ r'我局经复查[^,。,;\n]*?情况|' \ r'我局[^,。,;\n]*?认真复核|' \ r'经研究,对[^,。,;\n]*?予以采纳。|' \ r'我局认为.*?申辩理由|' \ r'依据.*?我局认为.*?的申辩理由|' \ r'经研究,我局认为.*?申辩意见|' \ r'经我局审核,决定|' \ r'我局认为,上述违法行为事实清楚、证据确凿、法律法规适当|' \ r'我局对陈述申辩意见进行了复核|' \ r'经我局审核|' \ r'针对[^,。,;\n]*?的(陈述)?申辩意见,我局进行了核实|' \ r'经查,我局认为|' \ r'依据现场检查及听证情况|' \ r'对上述陈述申辩意见,我局认为)' defense_compiler = re.compile(defense_text_str, re.MULTILINE) defense_list = defense_compiler.findall(content_text) if len(defense_list) != 0: defense = defense_list[-1][0] defense_response_str = defense.replace(r'[', r'\[').replace(r']', r'\]') \ + r'(([\s\S]*?)' + r'(本案现已审理终结。|不符合.*?情形。|根据.*?依法可从轻或者减轻行政处罚。|' \ r'对[^,。,;\n]*?申辩意见(不予|予以)采纳|因此.*?申辩理由.*?成立。|' \ r'我局认为.*?申辩(理由|意见).*?符合.*?第.*?条.*?的条件.(予以采纳。)?))' defense_response_compiler = re.compile(defense_response_str, re.MULTILINE) if defense_response_compiler.search(content_text): defense_response = defense_response_compiler.search(content_text).group(1).strip() else: if '未' in defense: defense_response = '' else: defense_text_str = r'(你公司在陈述申辩意见中未对违法事实、处罚理由及依据提出异议。|' \ r'你公司在规定期限内未提出陈述申辩意见。|' \ r'你机构在陈述申辩意见中未对违法事实、处罚理由及依据提出异议。|' \ r'你公司及陆忠豪在规定期限内未提出陈述申辩意见。|' \ r'你在规定期限内未提出陈述申辩意见。|' \ r'[^,。,;\n]*?在规定期限内未提出陈述申辩意见。|' \ r'你在规定期限内未要求听证,也未提出陈述申辩意见。|' \ r'可在收到本告知书之日起10日内向我局提交书面的陈述书和申辩书。逾期视为放弃陈述和申辩。)' defense_compiler = re.compile(defense_text_str, re.MULTILINE) defense = defense_compiler.search(content_text).group(1).strip() defense_response = '' else: defense = defense_response = '' 
punishment_decision_text_str = r'(((依据|根据|依照)[^。\n]*?第[^。\n]*?条[^。\n]*?(规定)?.?' \ r'(我局)?(决定|责令|给予|于.*?向.*?发出|对.*?作出|拟对你)|' \ r'我局经复核认为|我局(决定)?.*?作出(如下|以下)(行政)?(处罚)?处罚)' \ r'([\s\S]*?))' \ r'(请在本处罚决定书送达之日|当事人应当在接到本处罚决定书之日|如不服本处罚决定|' \ r'请(在)?接到本处罚决定书之日|如不服从本处罚决定|$|当事人应当在本处罚决定书送达之日|' \ r'请你在本处罚决定书送达之日|请.*?在接到本处罚决定书之日|' \ r'请.*?在本处罚决定书送达之日|请在接到本处罚决定之日|如你支公司对我局认定的违法事实)' punishment_decision_compiler = re.compile(punishment_decision_text_str) punishment_decision_list = punishment_decision_compiler.findall(content_text) punishment_decision = '\n'.join( [each_punishment_decision[0] for each_punishment_decision in punishment_decision_list]) punishment_basis_str_list = [ r'([^\n。;]*?)(问题|行为|事项|情况|事实)[^。;]*?违反[^。;]*?第.*?条.*?((的|之|等)(相关)?规定)?', r'华安财险贵州分公司聘任不具有任职资格的人员违反了《保险法》八十一条第一款、第二款和《任职资格规定》第四条第(三)项、第(五)项的规定', r'人保财险印江支公司强制投保人订立商业保险合同违反了《交强险条例》第十三条第二款', r'上述行为违反了《中华人民共和国保险法》第一百二十二条和第一百三十四条的规定', r'上述行为违反了《中华人民共和国保险法》八十六条', r'经查,你公司于2006年7月1日至9月30日期间,通过弄虚作假套取现金支付非法代理及相关人员手续费24.9万元,违反了《中华人民共和国保险法》第一百二十二条和第一百三十四条的规定', r'你支公司于2005年11月15日,未经我局批准,擅自将支公司及下辖南北大街营销服务部分别由我局批设地址镇宁县城关李家井8号、' r'镇宁县南北大街黄果树商城内迁至镇宁县南北大街中段中国农业发展银行镇宁县支行办公大楼二楼,违反了《保险法》第八十二条的规定' ] punishment_basis_str = '|'.join(punishment_basis_str_list) punishment_basis_compiler = re.compile('(。|\n|;|^)' + '(' + punishment_basis_str + ')' + '.(\n?依据|\n?根据|\n?鉴于|\n?依照)', re.MULTILINE) punishment_basis_list = punishment_basis_compiler.findall(content_text) punishment_basis = ';'.join([kk[1].strip() for kk in punishment_basis_list]) publish_date_text = re.search( punishment_decision.replace(r'(', r'\(').replace(r')', r'\)').replace(r'[', r'\[').replace(r']', r'\]'). replace(r'*', r'\*') + r'([\s\S]*?)$', content_text).group(1).replace('\n', '') if re.search(r'.{4}年.{1,2}月.{1,3}日', publish_date_text): publish_date = re.findall('.{4}年.{1,2}月.{1,3}日', publish_date_text)[-1].replace(' ', '') m = re.match("([0-9零一二两三四五六七八九十〇○OOΟО]+年)?([0-9一二两三四五六七八九十]+)月?([0-9一二两三四五六七八九十]+)[号日]?", publish_date) real_publish_date = get_year(m.group(1)) + str(cn2dig(m.group(2))) + '月' + str( cn2dig(m.group(3))) + '日' else: publish_date_text = table_content.find_all('tr')[1].text publish_date = re.findall(r'\d{4}-\d{1,2}-\d{1,2}', publish_date_text)[-1] real_publish_date = publish_date.split('-')[0] + '年' + str( int(publish_date.split('-')[1])) + '月' + str( int(publish_date.split('-')[2])) + '日' result_map = { 'announcementTitle': title, 'announcementOrg': '贵州银保监局', 'announcementDate': real_publish_date, 'announcementCode': document_code, 'facts': truth, 'defenseOpinion': defense, 'defenseResponse': defense_response, 'litigant': litigant[:-1] if litigant[-1] == ':' else litigant, 'punishmentBasement': punishment_basis, 'punishmentDecision': punishment_decision, 'type': '行政处罚决定', 'oss_file_id': file_id, 'status': 'not checked' } logger.info(result_map) if db.announcement.find({'announcementTitle': title, 'oss_file_id': file_id}).count() == 0: db.announcement.insert_one(result_map) logger.info('贵州保监局 数据解析 ' + ' -- 数据导入完成') else: logger.info('贵州保监局 数据解析 ' + ' -- 数据已经存在') db.parsed_data.update_one({'_id': file_id}, {'$set': {'parsed': True}}) logger.info('贵州保监局 数据解析 ' + ' -- 修改parsed完成')
def liaoning_circ(db, logger): for each_circ_data in db.circ_data.find({ 'origin': '辽宁保监局', 'status': { '$nin': ['ignored'] } }): announcement_url = each_circ_data['url'] announcement_title = each_circ_data['title'] if db.circ_data.find({ 'url': announcement_url, 'status': 'parsed' }).count() == 1 and db.parsed_data.find({ 'origin_url': announcement_url, 'oss_file_origin_url': announcement_url, 'parsed': True }).count() == 1: continue logger.info('辽宁保监局 ' + 'Url to parse: %s' % announcement_url) r = request_site_page(announcement_url) if r is None: logger.error('网页请求错误 %s' % announcement_url) continue content_soup = bs(r.content, 'lxml') if r else bs('', 'lxml') if db.parsed_data.find({ 'origin_url': announcement_url, 'oss_file_origin_url': announcement_url }).count() == 0: oss_file_map = { 'origin_url': announcement_url, 'oss_file_origin_url': announcement_url, 'origin_url_id': each_circ_data['_id'], 'oss_file_type': 'html', 'oss_file_name': announcement_title, 'oss_file_content': r.text.encode(r.encoding).decode('utf-8'), 'parsed': False } insert_response = db.parsed_data.insert_one(oss_file_map) file_id = insert_response.inserted_id oss_add_file(ali_bucket, str(file_id) + '/' + announcement_title + '.html', r.text.encode(r.encoding).decode('utf-8')) db.circ_data.update_one({'_id': each_circ_data['_id']}, {'$set': { 'status': 'parsed' }}) else: db.circ_data.update_one({'_id': each_circ_data['_id']}, {'$set': { 'status': 'parsed' }}) file_id = db.parsed_data.find_one({ 'origin_url': announcement_url, 'oss_file_origin_url': announcement_url })['_id'] table_content = content_soup.find(id='tab_content') if not table_content: logger.error('网页请求错误 %s' % announcement_url) continue content_text = get_content_text(table_content.find_all('tr')[3]) if content_text == '': continue title = table_content.find_all('tr')[0].text.strip() document_code_compiler = re.compile( r'(辽保监[罚告].\d{4}.\d+号|辽保监[罚告]\d{4}.\d+.号)') if document_code_compiler.search(content_text): document_code = document_code_compiler.search(content_text).group( 1).strip() litigant_compiler = re.compile( document_code.replace(r'[', r'\[').replace(r']', r'\]') + r'\n([\s\S]*?)\n' + r'(经查|经检查|依据.*?的有关规定|抽查|经抽查|' r'个险部内勤经查|' r'.*?公司在|你公司上报的监管报表显示|' r'.*?(期间|经营过程中|业务经营中).*?存在.*?(行为|问题)|' r'我局调查组于.*?对.*?进行调查,发现|' r'你公司在2007年1-12月业务经营和管理中)') litigant = litigant_compiler.search(content_text).group(1).strip() else: if document_code_compiler.search(title): document_code = document_code_compiler.search(title).group( 1).strip() else: document_code = '' litigant_compiler = re.compile( r'^([\s\S]*?)\n' + r'(经查|经检查|依据.*?的有关规定|抽查|经抽查|' r'个险部内勤经查|' r'.*?公司在|你公司上报的监管报表显示|' r'.*?(期间|经营过程中|业务经营中).*?存在.*?(行为|问题)|' r'中国人民人寿保险股份有限公司本溪营销服务部在2009年业务经营过程中)') litigant = litigant_compiler.search(content_text).group(1).strip() truth_text_str = r'((经查|经检查|检查发现|现场检查,发现|抽查|经抽查|你公司在2007年1-12月业务经营和管理中)' \ r'([\s\S]*?))' \ r'((我局认为,)?(上述|以上)\n?.*?\n?(事实|行为|事实)(,|,)?\n?有[\s\S]*?等\n?证\n?据(在案)?证明(,|,|。)(足以认定。)?|' \ r'(我局认为,|综上,)?(上述|以上).*?(行为|问题|事实).*?违反.*?第.*条.*?(的规定)?|' \ r'上述事实行为违反了|' \ r'依据)' truth_compiler = re.compile(truth_text_str) if truth_compiler.search(content_text): truth = truth_compiler.search(content_text).group(1).strip() else: truth_text_str = litigant + r'([\s\S]*?)' \ r'((我局认为,)?(上述|以上)\n?.*?\n?(事实|行为|事实)(,|,)?有.*?等\n?证据(在案)?证明(,|,|。)(足以认定。)?|' \ r'(我局认为,|综上,)?(上述|以上).*?(行为|问题|事实).*?违反.*?第.*条.*?(的规定)?|' \ r'上述违法违规事实)' truth_compiler = re.compile(truth_text_str) truth = truth_compiler.search(content_text).group(1).strip() if '申辩' in content_text: 
defense_text_str = r'((针对.*?行为.*?申辩意见|(当事人)?[^,。,;\n]*?(未)?提出(了)?陈述申辩(意见)?|' \ r'[^,。,;\n]*?向我局(报送|递交|提出)[^,。,;\n]*?|本案在审理过程中.*?提出陈述申辩|' \ r'[^,。,;\n]*?在(申辩材料|陈述申辩|陈述申辩意见|申辩意见)中称|[^,。,;\n]*?在听证阶段提出|' \ r'[^,。,;\n]*?在法定期限内(未)?提出(了)?(听证要求|陈述申辩|陈述申辩及听证要求)|' \ r'当事人在法定期限内未提出陈述申辩意见,也未提出听证申请|' \ r'当事人.*?提出了陈述、申辩|你在申辩中称|' \ r'我局向.*?送达《行政处罚事先告知书》后.*?(申辩|听证))' \ r'([\s\S]*?))' \ r'(因此,我局决定|' \ r'我局经复核(认为|决定)|' \ r'本案现已审理终结|' \ r'我局经复查[^,。,;\n]*?情况|' \ r'我局[^,。,;\n]*?认真复核|' \ r'经研究,对[^,。,;\n]*?予以采纳。|' \ r'我局认为.*?申辩理由|' \ r'依据.*?我局认为.*?的申辩理由|' \ r'经研究,我局认为.*?申辩意见|' \ r'经我局审核,决定|' \ r'我局认为,上述违法行为事实清楚、证据确凿、法律法规适当|' \ r'我局对.*?(陈述申辩意见|听证意见)进行了.*?复核|' \ r'经我局审核|' \ r'针对[^,。,;\n]*?的(陈述)?申辩意见,我局进行了核实|' \ r'经查,我局认为|' \ r'依据现场检查及听证情况|' \ r'案件现已审理终结。|' \ r'我局认为)' defense_compiler = re.compile(defense_text_str, re.MULTILINE) defense_list = defense_compiler.findall(content_text) if len(defense_list) != 0: defense = defense_list[-1][0].strip() defense_response_str = defense.replace(r'[', r'\[').replace(r']', r'\]') \ + r'(([\s\S]*?)' + r'(本案现已审理终结。|案件现已审理终结。|不符合.*?情形。|根据.*?依法可从轻或者减轻行政处罚。|' \ r'对[^,。,;\n]*?(申辩(意见|理由)|请求)(不予|予以)采纳|因此.*?申辩.*?成立。|' \ r'我局认为.*?申辩(理由|意见).*?符合.*?第.*?条.*?的条件.(予以采纳。)?))' defense_response_compiler = re.compile(defense_response_str, re.MULTILINE) if defense_response_compiler.search(content_text): defense_response = defense_response_compiler.search( content_text).group(1).strip() else: if '未' in defense: defense_response = '' else: defense_text_str = r'([^。;\n]*?向.*?公告送达了《行政处罚事先告知书》.*?提出陈述申辩。|' \ r'我局依法于2012年5月25日对你公司送达了《行政处罚事先告知书》,你公司在规定的时间内未提出陈述和申辩意见,也未要求举行听证。|' \ r'根据《中华人民共和国行政处罚法》第三十一条、第三十二条、《中国保险监督管理委员会行政处罚程序规定》第五条和第四十三条的规定,可在收到本告知书之日起10日内到辽宁保监局(地址:沈阳市沈河区北京街29号)进行陈述和申辩。逾期视为放弃陈述权和申辩权。)' defense_compiler = re.compile(defense_text_str, re.MULTILINE) defense = defense_compiler.search(content_text).group( 1).strip() defense_response = '' else: defense = defense_response = '' punishment_decision_text_str = r'(((依据|根据|按照)[^。;]*?第?[^。;]*?条[^。;]*?(规定)?\n?.?(我局)?(决定|责令|给予)|' \ r'我局(决定)?.*?作出(如下|以下)(行政)?处罚|综上,我局作出如下处罚|' \ r'依\n?据[^。;]*?规定(,我局决定|对你公司|,决定给予)|' \ r'依据上述法律法规,鉴于你公司对发现的问题较为重视,违规情节较轻,同时在事后采取了积极的补救措施,未造成严重的后果,决定给予|' \ r'综上,我局作出如下处罚)' \ r'([\s\S]*?))' \ r'(请在本处罚决定书送达之日|当事人应当在接到本处罚决定书之日|如不服本处罚决定|' \ r'请(在)?接到本处罚决定书之日|如不服从本处罚决定|当事人应当在接到本处罚决定之日|' \ r'当事人应当在接到处罚决定书之日|(当事人)?如对(本)?处罚决定不服|' \ r'请当事人在接到本处罚决定书之日|你.*?应当在接到本处罚决定书之日|' \ r'如你公司对我局认定的违法事实、处罚理由及依据有异议)' punishment_decision_compiler = re.compile(punishment_decision_text_str) punishment_decision = punishment_decision_compiler.search( content_text).group(1).strip() punishment_basis_str_list = [ r'([^\n。;,]*?)(问题|行为|事项|情况|事实)([^\n。;\s]*?)违反([\s\S]*?)', ] punishment_basis_str = '|'.join(punishment_basis_str_list) punishment_basis_compiler = re.compile( r'[。\n;,]' + '(' + punishment_basis_str + ')' + '.(\n?依\n?据|\n?根据|\n?鉴于)', re.MULTILINE) punishment_basis_list = punishment_basis_compiler.findall(content_text) punishment_basis = ';'.join( [kk[0].strip() for kk in punishment_basis_list]) publish_date_text = re.search( punishment_decision.replace(r'(', r'\(').replace( r')', r'\)').replace(r'[', r'\[').replace(r']', r'\]').replace( r'*', r'\*') + r'([\s\S]*?)$', content_text).group(1).replace('\n', '') if re.search(r'.{4}年.{1,2}月.{1,3}日', publish_date_text): publish_date = re.findall('.{4}年.{1,2}月.{1,3}日', publish_date_text)[-1].replace(' ', '') m = re.match( "([0-9零一二两三四五六七八九十〇○OOΟО]+年)?([0-9一二两三四五六七八九十]+)月?([0-9一二两三四五六七八九十]+)[号日]?", publish_date) real_publish_date = get_year(m.group(1)) + str(cn2dig( m.group(2))) + '月' + str(cn2dig(m.group(3))) 
+ '日' else: publish_date_text = table_content.find_all('tr')[1].text publish_date = re.findall(r'\d{4}-\d{1,2}-\d{1,2}', publish_date_text)[-1] real_publish_date = publish_date.split('-')[0] + '年' + str( int(publish_date.split('-')[1])) + '月' + str( int(publish_date.split('-')[2])) + '日' result_map = { 'announcementTitle': title, 'announcementOrg': '辽宁银保监局', 'announcementDate': real_publish_date, 'announcementCode': document_code, 'facts': truth, 'defenseOpinion': defense, 'defenseResponse': defense_response, 'litigant': litigant[:-1] if litigant[-1] == ':' else litigant, 'punishmentBasement': punishment_basis, 'punishmentDecision': punishment_decision, 'type': '行政处罚决定', 'oss_file_id': file_id, 'status': 'not checked' } logger.info(result_map) if db.announcement.find({ 'announcementTitle': title, 'oss_file_id': file_id }).count() == 0: db.announcement.insert_one(result_map) logger.info('辽宁保监局 数据解析 ' + ' -- 数据导入完成') else: logger.info('辽宁保监局 数据解析 ' + ' -- 数据已经存在') db.parsed_data.update_one({'_id': file_id}, {'$set': {'parsed': True}}) logger.info('辽宁保监局 数据解析 ' + ' -- 修改parsed完成')
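# The parser above normalizes dates such as "二〇一八年五月三十日" with get_year() and
# cn2dig(), which are imported from elsewhere in this project and not shown here.
# The helpers below are only a minimal, self-contained sketch of what equivalent
# functions might look like; the real implementations may differ.
CN_NUM = {'〇': 0, '○': 0, 'O': 0, 'Ο': 0, 'О': 0, '零': 0,
          '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
          '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10}


def sketch_cn2dig(text):
    """'5' -> 5, '三十' -> 30, '十二' -> 12; plain digits pass through unchanged."""
    if text.isdigit():
        return int(text)
    if '十' in text:
        tens, _, units = text.partition('十')
        return (CN_NUM.get(tens, 1) if tens else 1) * 10 + (CN_NUM[units] if units else 0)
    return CN_NUM.get(text, 0)


def sketch_get_year(year_text):
    """'二〇一八年' or '2018年' -> '2018年'; digit-by-digit, no positional carry."""
    if not year_text:
        return ''
    return ''.join(str(CN_NUM.get(ch, ch)) for ch in year_text.rstrip('年')) + '年'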
def shantou_circ(db, logger): for each_circ_data in db.circ_data.find({'origin': '汕头保监局'}): announcement_url = each_circ_data['url'] announcement_title = each_circ_data['title'] if db.circ_data.find( {'url': announcement_url, 'status': 'parsed'}).count() == 1 and db.parsed_data.find( {'origin_url': announcement_url, 'oss_file_origin_url': announcement_url, 'parsed': True}).count() == 1: continue logger.info('汕头保监局 ' + 'Url to parse: %s' % announcement_url) r = request_site_page(announcement_url) if r is None: logger.error('网页请求错误 %s' % announcement_url) continue content_soup = bs(r.content, 'lxml') if r else bs('', 'lxml') if db.parsed_data.find( {'origin_url': announcement_url, 'oss_file_origin_url': announcement_url}).count() == 0: oss_file_map = { 'origin_url': announcement_url, 'oss_file_origin_url': announcement_url, 'origin_url_id': each_circ_data['_id'], 'oss_file_type': 'html', 'oss_file_name': announcement_title, 'oss_file_content': r.text.encode(r.encoding).decode('utf-8'), 'parsed': False } insert_response = db.parsed_data.insert_one(oss_file_map) file_id = insert_response.inserted_id oss_add_file(ali_bucket, str(file_id) + '/' + announcement_title + '.html', r.text.encode(r.encoding).decode('utf-8')) db.circ_data.update_one({'_id': each_circ_data['_id']}, {'$set': {'status': 'parsed'}}) else: db.circ_data.update_one({'_id': each_circ_data['_id']}, {'$set': {'status': 'parsed'}}) file_id = db.parsed_data.find_one({'origin_url': announcement_url, 'oss_file_origin_url': announcement_url})['_id'] table_content = content_soup.find(id='tab_content') if not table_content: logger.error('网页请求错误 %s' % announcement_url) continue content_text = get_content_text(table_content.find_all('tr')[3]) if content_text == '': continue title = table_content.find_all('tr')[0].text.strip() if '行政处罚事项' in title: continue else: document_code_compiler = re.compile(r'(汕银?保监罚.\d{4}.\d+号)') if document_code_compiler.search(content_text): document_code = document_code_compiler.search(content_text).group(1).strip() litigant_compiler = re.compile( document_code.replace(r'[', r'\[').replace(r']', r'\]') + r'\n([\s\S]*?)\n' + r'(经查|经检查|依据.*?有关规定|抽查' r'|经抽查|一、经查|一、 经查|' r'.*?我分局检查组对.*?进行了现场检查)') litigant = litigant_compiler.search(content_text).group(1).strip() else: if document_code_compiler.search(title): document_code = document_code_compiler.search(title).group(1).strip() else: document_code = '' litigant_compiler = re.compile(r'^([\s\S]*?)\n' + r'(经查|经检查|依据.*?有关规定|抽查|经抽查|一、经查|' r'.*?我分局检查组对.*?进行了现场检查|一、 经查)') litigant = litigant_compiler.search(content_text).group(1).strip() truth_text_str = r'(经查,|经查,|经检查,|经查实,|检查发现,|现场检查,发现|抽查,|经抽查,|经查.*?存在以下问题:)' \ r'([\s\S]*?)' \ r'((我局认为,)?(上述|以上).*?(事实|行为|事实)(,|,)?有.*?等.*?证明(,|,|。)(足以认定。)?|' \ r'(我局认为,|综上,)?(上述|以上).*?(行为|问题|事实).*?违反.*?第.*条.*?(的规定)?|' \ r'.*?的行为.*?违反了.*?第.*?条的规定|' \ r'上述情况违反了.*?第.*?条)' truth_compiler = re.compile(truth_text_str) if truth_compiler.search(content_text): truth = truth_compiler.search(content_text).group(2).strip() else: truth_text_str = litigant + r'([\s\S]*?)' \ r'((我局认为,)?(上述|以上).*?(事实|行为|事实)(,|,)?有.*?等证据(在案)?证明(,|,|。)(足以认定。)?|' \ r'(我局认为,|综上,)?(上述|以上).*?(行为|问题|事实).*?违反.*?第.*条.*?(的规定)?)' truth_compiler = re.compile(truth_text_str) truth = truth_compiler.search(content_text).group(1).strip() if '申辩' in content_text: defense_text_str = r'((针对.*?行为.*?申辩意见|(当事人)?[^,。,;\n]*?(未)?提出(了)?陈述申辩(意见)?|' \ r'[^,。,;\n]*?向我局(报送|递交|提出)[^,。,;\n]*?|本案在审理过程中.*?提出陈述申辩|' \ r'[^,。,;\n]*?在(申辩材料|陈述申辩|陈述申辩意见|申辩意见)中称|[^,。,;\n]*?在听证阶段提出|' \ 
r'[^,。,;\n]*?在法定期限内(未)?提出(了)?(听证要求|陈述申辩|陈述申辩及听证要求)|' \ r'[^,。,;\n]*?向我分局提出陈述和申辩|你分公司在陈述申辩中称)' \ r'([\s\S]*?))' \ r'(经核查|经我分局核查)' defense_compiler = re.compile(defense_text_str, re.MULTILINE) defense_list = defense_compiler.findall(content_text) if len(defense_list) != 0: defense = defense_list[-1][0].strip() defense_response_str = defense.replace(r'[', r'\[').replace(r']', r'\]') \ + r'(([\s\S]*?)' + r'(本案现已审理终结。|不符合.*?情形。|根据.*?依法可从轻或者减轻行政处罚。|' \ r'对[^,。,;\n]*?申辩意见(不予|予以)采纳|(因此)?.*?申辩理由.*?成立。|' \ r'我局认为.*?申辩(理由|意见).*?符合.*?第.*?条.*?的条件.(予以采纳。)?))' defense_response_compiler = re.compile(defense_response_str, re.MULTILINE) if defense_response_compiler.search(content_text): defense_response = defense_response_compiler.search(content_text).group(1).strip() else: if '未' in defense: defense_response = '' else: defense_text_str = '(我分局于.*?向你.*?送达.*?《行政处罚事先告知书》,(截至|截止)陈述申辩期满,我分局未收到你.*?的陈述和申辩。)' defense_compiler = re.compile(defense_text_str, re.MULTILINE) defense = defense_compiler.search(content_text).group(1).strip() defense_response = '' else: defense = defense_response = '' punishment_decision_text_str = r'(((依据|根据).*?第?.*?条.*?(规定)?.?.*?(决定|责令|给予|于.*?向.*?发出|对.*?作出|拟对你)|' \ r'我局经复核认为|我局决定.*?作出(如下|以下)(行政)?处罚:|' \ r'根据《中华人民共和国保险法》第一百六十二的规定,我局责令)' \ r'([\s\S]*?))' \ r'(请在本处罚决定书送达之日|当事人应当在接到本处罚决定书之日|如不服本处罚决定|' \ r'请(在)?接到本处罚决定书之日|如不服从本处罚决定|请当事人在接到本处罚决定书之日|' \ r'当事人如不服本处罚决定|自接到本处罚决定书之日起)' punishment_decision_compiler = re.compile(punishment_decision_text_str) punishment_decision = punishment_decision_compiler.search(content_text).group(1).strip() punishment_basis_str_list = [ r'([^\n。;、]*?)(问题|行为|事项|情况|事实)([^\n。;\s]*?)违反.*?\n?.*?第.*?条?\n?.*?((的|之|等)(相关)?规定)?', ] punishment_basis_str = '|'.join(punishment_basis_str_list) punishment_basis_compiler = re.compile(r'[。\n;、]' + '(' + punishment_basis_str + ')' + '.(\n?(应当)?依据|\n?根据|\n?鉴于|\n?我分局于)', re.MULTILINE) punishment_basis_list = punishment_basis_compiler.findall(content_text) punishment_basis = ';'.join([kk[0].strip() for kk in punishment_basis_list]) publish_date_text = re.search( punishment_decision.replace(r'(', r'\(').replace(r')', r'\)').replace(r'[', r'\[').replace(r']', r'\]'). 
replace(r'*', r'\*') + r'([\s\S]*?)$', content_text).group(1).replace('\n', '') if re.search(r'.{4}年.{1,2}月.{1,3}日', publish_date_text): publish_date = re.findall('.{4}年.{1,2}月.{1,3}日', publish_date_text)[-1].replace(' ', '') m = re.match("([0-9零一二两三四五六七八九十〇○OOΟ]+年)?([0-9一二两三四五六七八九十]+)月?([0-9一二两三四五六七八九十]+)[号日]?", publish_date) real_publish_date = get_year(m.group(1)) + str(cn2dig(m.group(2))) + '月' + str(cn2dig(m.group(3))) + '日' else: publish_date_text = table_content.find_all('tr')[1].text publish_date = re.findall(r'\d{4}-\d{1,2}-\d{1,2}', publish_date_text)[-1] real_publish_date = publish_date.split('-')[0] + '年' + str(int(publish_date.split('-')[1])) + '月' + str( int(publish_date.split('-')[2])) + '日' result_map = { 'announcementTitle': title, 'announcementOrg': '汕头银保监分局', 'announcementDate': real_publish_date, 'announcementCode': document_code, 'facts': truth, 'defenseOpinion': defense, 'defenseResponse': defense_response, 'litigant': litigant[:-1] if litigant[-1] == ':' else litigant, 'punishmentBasement': punishment_basis, 'punishmentDecision': punishment_decision, 'type': '行政处罚决定', 'oss_file_id': file_id, 'status': 'not checked' } logger.info(result_map) if db.announcement.find({'announcementTitle': title, 'oss_file_id': file_id}).count() == 0: db.announcement.insert_one(result_map) logger.info('汕头保监分局 数据解析 ' + ' -- 数据导入完成') else: logger.info('汕头保监分局 数据解析 ' + ' -- 数据已经存在') db.parsed_data.update_one({'_id': file_id}, {'$set': {'parsed': True}}) logger.info('汕头保监分局 数据解析 ' + ' -- 修改parsed完成')
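# Design note: the parsers above splice already-matched text (document_code, defense,
# punishment_decision) back into new patterns and escape metacharacters by hand with
# chained .replace() calls, which only covers ( ) [ ] and *.  re.escape() handles every
# metacharacter in one call; a possible refactor (a sketch, not what the code currently
# does) would be:
import re


def build_follow_pattern(prefix_text, body=r'([\s\S]*?)', terminator=r'(经核查|经我分局核查)'):
    """Match `body` between a literal prefix and a terminator alternation."""
    return re.compile(re.escape(prefix_text) + body + terminator, re.MULTILINE)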
def qingdao_circ(db, logger): for each_circ_data in db.circ_data.find({ 'origin': '青岛保监局', 'status': { '$nin': ['ignored'] } }): announcement_url = each_circ_data['url'] announcement_title = each_circ_data['title'] if db.circ_data.find({ 'url': announcement_url, 'status': 'parsed' }).count() == 1 and db.parsed_data.find({ 'origin_url': announcement_url, 'oss_file_origin_url': announcement_url, 'parsed': True }).count() == 1: continue logger.info('青岛保监局 ' + 'Url to parse: %s' % announcement_url) r = request_site_page(announcement_url) if r is None: logger.error('网页请求错误 %s' % announcement_url) continue content_soup = bs(r.content, 'lxml') if r else bs('', 'lxml') if db.parsed_data.find({ 'origin_url': announcement_url, 'oss_file_origin_url': announcement_url }).count() == 0: oss_file_map = { 'origin_url': announcement_url, 'oss_file_origin_url': announcement_url, 'origin_url_id': each_circ_data['_id'], 'oss_file_type': 'html', 'oss_file_name': announcement_title, 'oss_file_content': r.text.encode(r.encoding).decode('utf-8'), 'parsed': False } insert_response = db.parsed_data.insert_one(oss_file_map) file_id = insert_response.inserted_id oss_add_file(ali_bucket, str(file_id) + '/' + announcement_title + '.html', r.text.encode(r.encoding).decode('utf-8')) db.circ_data.update_one({'_id': each_circ_data['_id']}, {'$set': { 'status': 'parsed' }}) else: db.circ_data.update_one({'_id': each_circ_data['_id']}, {'$set': { 'status': 'parsed' }}) file_id = db.parsed_data.find_one({ 'origin_url': announcement_url, 'oss_file_origin_url': announcement_url })['_id'] table_content = content_soup.find(id='tab_content') if not table_content: logger.error('网页请求错误 %s' % announcement_url) continue content_text = get_content_text(table_content.find_all('tr')[3]) if content_text == '': continue title = table_content.find_all('tr')[0].text.strip() document_code_compiler = re.compile(r'(青岛保监罚.\d{4}.\d+号)') if document_code_compiler.search(content_text): document_code = document_code_compiler.search(content_text).group( 1).strip() litigant_compiler = re.compile( document_code.replace(r'[', r'\[').replace(r']', r'\]') + r'\n([\s\S]*?)\n' + r'(经查|经检查|依据.*?的有.*?关规定|抽查|经抽查|' r'.*?(现场检查|举报检查|业务核查|举报调查|检查中发现|青岛保监局关于整顿和规范保险市场的具体要求))') litigant = litigant_compiler.search(content_text).group(1).strip() else: if document_code_compiler.search(title): document_code = document_code_compiler.search(title).group( 1).strip() else: document_code = '' litigant_compiler = re.compile( r'^([\s\S]*?)\n' + r'(经查|经检查|依据.*?的有.*?关规定|抽查|经抽查|' r'.*?(现场检查|举报检查|举报调查|业务核查|检查中发现|青岛保监局关于整顿和规范保险市场的具体要求))') litigant = litigant_compiler.search(content_text).group(1).strip() truth_text_str = r'((经查|经检查)' \ r'([\s\S]*?))' \ r'((我局认为,)?(上述|以上|该).*?(事实|行为|事实)(,|,)?有.*?等(证据)?(材料)?(在案)?(证明|佐证)(,|,|。)(足以认定。)?|' \ r'(我局认为,|综上,)?(上述|以上).*?(行为|问题|事实).*?违反.*?第.*条.*?(的规定)?)' truth_compiler = re.compile(truth_text_str) if truth_compiler.search(content_text): truth = truth_compiler.search(content_text).group(1).strip() else: truth_text_str = litigant.replace(r'*', r'\*') + \ r'([\s\S]*?)' \ r'((我局认为,)?(上述|以上).*?(事实|行为|事实行为)(,|,)?有.*?等(证据)?(在案)?(证明|佐证|作证)(,|,|。)(足以认定。)?|' \ r'(我局认为,|综上,)?(上述|以上).*?(行为|问题|事实).*?违反.*?第.*条.*?(的规定)?|' \ r'上述违反了《中华人民共和国保险法》第一百二十二条的规定|' \ r'上述行为《中华人民共和国保险法》第一百二十二条的规定)' truth_compiler = re.compile(truth_text_str) truth = truth_compiler.search(content_text).group(1).strip() if '申辩' in content_text: defense_text_str = r'((针对.*?行为.*?申辩意见|(当事人)?[^,。,;\n]*?(未)?提出(了)?陈述申辩(意见)?|' \ r'[^,。,;\n]*?向我局(报送|递交|提出)[^,。,;\n]*?|本案在审理过程中.*?提出陈述申辩|' \ 
r'[^,。,;\n]*?在(申辩材料|陈述申辩|陈述申辩意见|申辩意见)中称|[^,。,;\n]*?在听证阶段提出|' \ r'[^,。,;\n]*?在法定期限内(未)?提出(了)?(听证要求|陈述申辩|陈述申辩及听证要求)|' \ r'当事人在法定期限内未提出申辩意见|你公司在规定时间内提出陈述申辩并提出三条理由|' \ r'在规定期限内,你提出了陈述和申辩|你.*?在申辩书中提出|你公司提到的.*?陈述申辩材料)' \ r'([\s\S]*?))' \ r'(因此,我局决定|' \ r'我局经复核(认为|决定)|' \ r'本案现已审理终结|' \ r'我局经复查[^,。,;\n]*?情况|' \ r'我局[^,。,;\n]*?认真复核|' \ r'经研究,对[^,。,;\n]*?予以采纳。|' \ r'我局认为.*?申辩理由|' \ r'依据.*?我局认为.*?的申辩理由|' \ r'经研究,我局认为.*?申辩意见|' \ r'经我局审核,决定|' \ r'我局认为,上述违法行为事实清楚、证据确凿、法律法规适当|' \ r'我局对陈述申辩意见进行了复核|' \ r'经我局审核|' \ r'针对[^,。,;\n]*?的(陈述)?申辩意见,我局进行了核实|' \ r'经查,我局认为|' \ r'依据现场检查及听证情况|' \ r'我局依法对陈述申辩意见进行了复核|' \ r'对你公司|' \ r'经复核|' \ r'我局在处罚裁量时已充分考虑此情节|' \ r'没有相关证据支持)' defense_compiler = re.compile(defense_text_str, re.MULTILINE) defense_list = defense_compiler.findall(content_text) if len(defense_list) != 0: defense = defense_list[-1][0].strip() defense_response_str = defense.replace(r'[', r'\[').replace(r']', r'\]') \ + r'(([\s\S]*?)' + r'(本案现已审理终结。|不符合.*?情形。|根据.*?依法可从轻或者减轻行政处罚。|' \ r'对[^,。,;\n]*?申辩意见(不予|予以)采纳|因此.*?申辩理由.*?成立。|' \ r'予以采纳。|此申辩不被采纳。|我局不予采纳。|' \ r'我局认为.*?申辩(理由|意见).*?符合.*?第.*?条.*?的条件.(予以采纳。)?))' defense_response_compiler = re.compile(defense_response_str, re.MULTILINE) if defense_response_compiler.search(content_text): defense_response = defense_response_compiler.search( content_text).group(1).strip() else: if '未' in defense: defense_response = '' else: defense_text_str = '([^。;\n]*?向.*?公告送达了《行政处罚事先告知书》.*?提出陈述申辩。|' \ '我局依法于2012年5月25日对你公司送达了《行政处罚事先告知书》,你公司在规定的时间内未提出陈述和申辩意见,也未要求举行听证。)' defense_compiler = re.compile(defense_text_str, re.MULTILINE) defense = defense_compiler.search(content_text).group( 1).strip() defense_response = '' else: defense = defense_response = '' punishment_decision_text_str = r'(((依据|根据).*?第?.*?条.*?(规定)?.?(我局)?(决定|责令|给予|于.*?向.*?发出|对.*?作出|拟对你)|' \ r'我局经复核认为|我局决定.*?作出(如下|以下)(行政)?处罚:|我局拟对|我局决定对)' \ r'([\s\S]*?))' \ r'(请在本处罚决定书送达之日|当事人应当在接到本处罚决定书之日|如不服本处罚决定|' \ r'请(在)?接到本处罚决定书之日|如不服从本处罚决定|当事人如对本处罚决定不服|' \ r'你公司应严格按照《关于保险公司缴纳罚款等有关问题的通知》)' punishment_decision_compiler = re.compile(punishment_decision_text_str) punishment_decision = punishment_decision_compiler.search( content_text).group(1).strip() punishment_basis_str_list = [ r'([^\n。;]*?)(问题|行为|事项|情况|事实)([^\n。;\s]*?)违反.*?\n?.*?第.*?条?\n?.*?((的|之|等)(相关)?规定)?', r'上述违反了《中华人民共和国保险法》第一百二十二条的规定', r'上述行为《中华人民共和国保险法》第一百二十二条的规定' ] punishment_basis_str = '|'.join(punishment_basis_str_list) punishment_basis_compiler = re.compile( r'[。\n;]' + '(' + punishment_basis_str + ')' + '.(\n?(应当)?依据|\n?(应当)?根据|\n?鉴于|\n?我局决定|\n?按照)', re.MULTILINE) punishment_basis_list = punishment_basis_compiler.findall(content_text) punishment_basis = ';'.join( [kk[0].strip() for kk in punishment_basis_list]) publish_date_text = re.search( punishment_decision.replace(r'(', r'\(').replace( r')', r'\)').replace(r'[', r'\[').replace(r']', r'\]').replace( r'*', r'\*') + r'([\s\S]*?)$', content_text).group(1).replace('\n', '') if re.search(r'.{4}年.{1,2}月.{1,3}日', publish_date_text): publish_date = re.findall('.{4}年.{1,2}月.{1,3}日', publish_date_text)[-1].replace(' ', '') m = re.match( "([0-9零一二两三四五六七八九十〇○OOΟО]+年)?([0-9一二两三四五六七八九十]+)月?([0-9一二两三四五六七八九十]+)[号日]?", publish_date) real_publish_date = get_year(m.group(1)) + str(cn2dig( m.group(2))) + '月' + str(cn2dig(m.group(3))) + '日' else: publish_date_text = table_content.find_all('tr')[1].text publish_date = re.findall(r'\d{4}-\d{1,2}-\d{1,2}', publish_date_text)[-1] real_publish_date = publish_date.split('-')[0] + '年' + str( int(publish_date.split('-')[1])) + '月' + str( int(publish_date.split('-')[2])) + '日' result_map = { 
'announcementTitle': title, 'announcementOrg': '青岛银保监局', 'announcementDate': real_publish_date, 'announcementCode': document_code, 'facts': truth, 'defenseOpinion': defense, 'defenseResponse': defense_response, 'litigant': litigant[:-1] if litigant[-1] == ':' else litigant, 'punishmentBasement': punishment_basis, 'punishmentDecision': punishment_decision, 'type': '行政处罚决定', 'oss_file_id': file_id, 'status': 'not checked' } logger.info(result_map) if db.announcement.find({ 'announcementTitle': title, 'oss_file_id': file_id }).count() == 0: db.announcement.insert_one(result_map) logger.info('青岛保监局 数据解析 ' + ' -- 数据导入完成') else: logger.info('青岛保监局 数据解析 ' + ' -- 数据已经存在') db.parsed_data.update_one({'_id': file_id}, {'$set': {'parsed': True}}) logger.info('青岛保监局 数据解析 ' + ' -- 修改parsed完成')
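# The CIRC parsers above all finish with the same sequence: insert result_map only if
# (announcementTitle, oss_file_id) is not already present, then set parsed=True on the
# source record.  A shared helper could factor this out; the function below is purely
# illustrative and does not exist in this code base.
def save_announcement(db, logger, org_label, result_map, file_id):
    query = {'announcementTitle': result_map['announcementTitle'], 'oss_file_id': file_id}
    if db.announcement.count_documents(query) == 0:
        db.announcement.insert_one(result_map)
        logger.info('%s 数据解析 -- 数据导入完成', org_label)
    else:
        logger.info('%s 数据解析 -- 数据已经存在', org_label)
    db.parsed_data.update_one({'_id': file_id}, {'$set': {'parsed': True}})
    logger.info('%s 数据解析 -- 修改parsed完成', org_label)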
def csrc_crawler(): # 行政处罚决定 + 市场禁入决定 url_list = [ { 'page_url': 'http://www.csrc.gov.cn/pub/zjhpublic/index.htm?channel=3300/3313', 'request_url': 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3313/index_7401' }, { 'page_url': 'http://www.csrc.gov.cn/pub/zjhpublic/index.htm?channel=3300/3619', 'request_url': 'http://www.csrc.gov.cn/pub/zjhpublic/3300/3619/index_7401' }, ] # 责令整改通知 url2_list = [ 'http://www.csrc.gov.cn/pub/newsite/xzcfw/zlzgtz/index', ] # 要闻 url3_list = [ 'http://www.csrc.gov.cn/pub/newsite/zjhxwfb/xwdd/index' ] new_csrc_announcement_list = [] for index, each_url_info in enumerate(url_list): logger.info('行政处罚决定' if index == 0 else '市场禁入决定') # get page_count page_count_url = each_url_info['request_url'] + '.htm' response = request_site_page(page_count_url) if response is None: logger.error('网页请求错误 %s' % page_count_url) continue page_count = int(int(re.search(r'var m_nRecordCount = "(\d+)"?;', response.text).group(1).strip()) / 20 + 1) logger.info(('行政处罚决定' if index == 0 else '市场禁入决定') + ' -- 一共有%d页' % page_count) # get crawler data for i in range(page_count): logger.info(('行政处罚决定' if index == 0 else '市场禁入决定') + ' -- 第%d页' % (i + 1)) url = each_url_info['request_url'] + '_' + str(i) + '.htm' if i > 0 \ else each_url_info['request_url'] + '.htm' try: content_response = request_site_page(url) if content_response is None: logger.error('网页请求错误 %s' % url) continue content_soup = bs(content_response.content, 'lxml') if content_response else bs('', 'lxml') dl_content = content_soup.find(id='documentContainer') if not dl_content: logger.error('网页请求错误 %s' % url) continue for each_dd in dl_content.find_all(class_='row'): try: if len(each_dd.find_all('a')) > 0: announcement_url = urljoin(url, each_dd.find('a').attrs['href']) if db.csrc_data.find({'url': announcement_url}).count() == 0: title = each_dd.find('a').text.strip() announcement_date = each_dd.find(class_='fbrq').text.strip() logger.info('证监会' + ('行政处罚决定' if index == 0 else '市场禁入决定') + '新公告:' + announcement_url) post = { 'title': title, 'publishDate': announcement_date, 'url': announcement_url, 'type': '行政处罚决定' if index == 0 else '市场禁入决定', 'origin': '证监会', 'status': 'not parsed' } if post not in new_csrc_announcement_list: new_csrc_announcement_list.append(post) else: if config['crawler_update_type']['update_type'] == '0': break except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue except Exception as e: logger.error(e) continue for each_url in url2_list: logger.info('责令整改通知') # get page_count page_count_url = each_url + '.htm' response = request_site_page(page_count_url) if response is None: logger.error('网页请求错误 %s' % page_count_url) continue soup = bs(response.content, 'lxml') if response else bs('', 'lxml') page_count_text = soup.find(class_='page').text if soup.find(class_='page') else '' page_count = int(re.findall(r'\d+', page_count_text)[0]) if page_count_text != '' else 0 logger.info('责令整改通知 -- 一共有%d页' % page_count) # get crawler data for i in range(page_count): logger.info('责令整改通知 -- 第%d页' % (i + 1)) url = each_url + '_' + str(i) + '.htm' if i > 0 else each_url + '.htm' try: content_response = request_site_page(url) if content_response is None: logger.error('网页请求错误 %s' % url) continue content_soup = bs(content_response.content, 'lxml') if content_response else bs('', 'lxml') dl_content = content_soup.find(id='myul') if not dl_content: logger.error('网页请求错误 %s' % url) continue for each_dd in dl_content.find_all('li'): try: if len(each_dd.find_all('a')) > 0: announcement_url = urljoin(url, 
each_dd.find('a').attrs['href']) if db.csrc_data.find({'url': announcement_url}).count() == 0: title = each_dd.find('a').attrs['title'].strip() announcement_date = each_dd.find('span').text.strip() logger.info('证监会责令整改通知新公告:' + announcement_url) post = { 'title': title, 'publishDate': announcement_date, 'url': announcement_url, 'type': '责令整改通知', 'origin': '证监会', 'status': 'not parsed' } if post not in new_csrc_announcement_list: new_csrc_announcement_list.append(post) else: if config['crawler_update_type']['update_type'] == '0': break except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue except Exception as e: logger.error(e) continue for each_url in url3_list: logger.info('要闻') # get page_count page_count_url = each_url + '.html' response = request_site_page(page_count_url) if response is None: logger.error('网页请求错误 %s' % page_count_url) continue soup = bs(response.content, 'lxml') if response else bs('', 'lxml') page_count = int(re.search(r'var countPage = (\d+)', soup.text).group(1).strip()) logger.info('要闻 -- 一共有%d页' % page_count) # get crawler data for i in range(page_count): logger.info('要闻 -- 第%d页' % (i + 1)) url = each_url + '_' + str(i) + '.html' if i > 0 else each_url + '.html' try: content_response = request_site_page(url) if content_response is None: logger.error('网页请求错误 %s' % url) continue content_soup = bs(content_response.content, 'lxml') if content_response else bs('', 'lxml') dl_content = content_soup.find(id='myul') if not dl_content: logger.error('网页请求错误 %s' % url) continue for each_dd in dl_content.find_all('li'): try: if len(each_dd.find_all('a')) > 0: title = each_dd.find('a').attrs['title'].strip() if re.search(r'^证监会.*?作出行政处罚(决定)?$', title) or '现场检查情况' in title: announcement_url = urljoin(url, each_dd.find('a').attrs['href']) if db.csrc_data.find({'url': announcement_url}).count() == 0: announcement_date = each_dd.find('span').text.strip() logger.info('证监会要闻新公告:' + announcement_url) post = { 'title': title, 'publishDate': announcement_date, 'url': announcement_url, 'type': '要闻', 'origin': '证监会', 'status': 'not parsed' } if post not in new_csrc_announcement_list: new_csrc_announcement_list.append(post) else: if config['crawler_update_type']['update_type'] == '0': break except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue except Exception as e: logger.error(e) continue if len(new_csrc_announcement_list) > 0: logger.info('证监会一共有%d条新公告,导入数据库中......' % len(new_csrc_announcement_list)) r = db.csrc_data.insert_many(new_csrc_announcement_list) if len(r.inserted_ids) == len(new_csrc_announcement_list): logger.info('证监会公告导入完成!') else: logger.error('证监会公告导入出现问题!') else: logger.info('证监会没有新公告!')
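# Note on the paging arithmetic used in csrc_crawler() and local_csrc_crawler():
# int(total / 20 + 1) requests one page too many whenever total is an exact multiple of
# 20 (e.g. 40 records -> 3 pages instead of 2).  math.ceil gives the exact count; a
# drop-in alternative (a sketch, not the current behaviour) would be:
import math


def pages_for(total_records, page_size=20):
    """Number of result pages needed for `total_records` items."""
    return math.ceil(total_records / page_size)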
def cq_crawler_first(url_format): result_list = [] response = request_site_page(url_format.format('')) logger.info("{} 抓取URL:{}".format(gov_name, url_format.format(''))) stop_flag = False if response is None: logger.error('网页请求错误{}'.format(url_format.format(''))) soup = bs(response.content if response else '', 'lxml') page_count_text = soup.body.find(class_='fenye') page_count = int(re.findall(r'\d+', str(page_count_text.text) if page_count_text else '')[1]) logger.info('{} 一共有{}页'.format(gov_name, page_count)) if db.crawler.find({'url': url_format.format('')}).count() > 0: last_updated_url = db.crawler.find_one({'url': url_format.format('')})['last_updated'] else: last_updated_url = '' for num in range(page_count): url = url_format.format('_' + str(num + 1) if num != 0 else '') try: response = request_site_page(url) logger.info('第%d页' % (num + 1)) if response is None: logger.error('网页请求错误{}'.format(url)) soup = bs(response.content if response else '', 'lxml') for index, x in enumerate(soup.body.find(class_='list_main_right_content').find_all('li')): anc_url = x.a.attrs['href'].strip() if not anc_url.startswith('http'): anc_url = urljoin(url_format.format(''), anc_url) if anc_url == last_updated_url: stop_flag = True logger.info('到达上次爬取的链接') break if num == 0 and index == 0: if db.crawler.find({'url': url_format.format('')}).count() > 0: if db.crawler.find_one({'url': url_format.format('')})['last_updated'] != anc_url: db.crawler.update_one({'url': url_format.format('')}, {'$set': {'last_updated': anc_url}}) else: db.crawler.insert_one( {'url': url_format.format(''), 'last_updated': anc_url, 'origin': gov_name}) if db[collection_name].count_documents({'url': anc_url}) == 0: info = { 'title': x.a.text, 'publishDate': x.span.text, 'url': anc_url, 'type': '行政处罚决定', 'origin': gov_name, 'status': 'not parsed' } logger.info('{} 新公告:{} url: {}'.format(gov_name, info['title'], anc_url)) if info not in result_list: result_list.append(info) else: if config['crawler_update_type']['update_type'] == '0': break if stop_flag: logger.info('到达上次爬取的链接') break except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue if len(result_list) > 0: logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list))) r = db[collection_name].insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('{}公告导入完成!'.format(gov_name)) else: logger.error('{}公告导入出现问题!'.format(gov_name)) else: logger.info('{}没有新公告!'.format(gov_name))
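# cq_crawler_first() keeps one bookmark per list URL in db.crawler: it stops as soon as
# the previously newest announcement link is seen again, and moves the bookmark to the
# first link of the first page.  The same pattern reappears in the finance/tax crawlers
# below.  Isolated here for illustration only; these helpers are not existing project code.
def load_bookmark(db, list_url):
    doc = db.crawler.find_one({'url': list_url})
    return doc['last_updated'] if doc else ''


def move_bookmark(db, list_url, newest_url, origin):
    if db.crawler.find_one({'url': list_url}):
        db.crawler.update_one({'url': list_url}, {'$set': {'last_updated': newest_url}})
    else:
        db.crawler.insert_one({'url': list_url, 'last_updated': newest_url, 'origin': origin})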
def nafmii_crawler(): result_list = [] prefix_url = 'http://www.nafmii.org.cn/zlgl/zwrz/zlcf/' # get page count response = request_site_page(prefix_url) if response is None: logger.error('网页请求错误 %s' % prefix_url) return page_count = int( re.search(r'var countPage = (\d+)', response.text).group(1).strip()) logger.info('交易商协会' + ' 一共有%d页' % page_count) # get crawler data for num in range(page_count): logger.info('交易商协会 -- 第%d页' % (num + 1)) if num == 0: url = prefix_url + 'index.html' else: url = prefix_url + 'index_' + str(num) + '.html' try: content_response = request_site_page(url) if content_response is None: logger.error('网页请求错误 %s' % url) continue content_soup = bs(content_response.content, 'lxml') if content_response else '' table_content = content_soup.find_all('table')[-4] for each_tr in table_content.find_all('tr')[2:-2]: try: announcement_url = urljoin( url, each_tr.find('a').attrs['href'].strip()) if db.nafmii_data.find({ 'url': announcement_url }).count() == 0: title = each_tr.find('a').text.strip() publish_date = each_tr.find_all('td')[-1].text.replace( '/', '-').strip() logger.info('交易商协会 -- 新公告:' + announcement_url) post = { 'title': title, 'publishDate': publish_date, 'url': announcement_url, 'type': '行政处罚决定', 'origin': '交易商协会', 'status': 'not parsed' } if post not in result_list: result_list.append(post) else: if config['crawler_update_type']['update_type'] == '0': break except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue except Exception as e: logger.error(e) continue if len(result_list) > 0: logger.info('交易商协会 -- 一共有%d条新公告,导入数据库中......' % len(result_list)) r = db.nafmii_data.insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('交易商协会 -- 公告导入完成!') else: logger.error('交易商协会 -- 公告导入出现问题!') else: logger.info('交易商协会 -- 没有新公告!')
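# nafmii_crawler() builds "index.html" for the first page and "index_<n>.html" for the
# following pages, then resolves every href against the page URL with urljoin so that
# relative and absolute links both work.  A quick self-contained illustration (the href
# below is hypothetical):
from urllib.parse import urljoin


def nafmii_page_url(prefix_url, page_index):
    """page_index 0 -> index.html, 1 -> index_1.html, ..."""
    name = 'index.html' if page_index == 0 else 'index_%d.html' % page_index
    return prefix_url + name


# urljoin keeps absolute hrefs untouched and resolves relative ones, e.g.:
# urljoin('http://www.nafmii.org.cn/zlgl/zwrz/zlcf/index.html', './201801/t123.html')
# -> 'http://www.nafmii.org.cn/zlgl/zwrz/zlcf/201801/t123.html'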
def sxsczt_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [{ 'url': 'http://www.sxscz.gov.cn/cms_search.action?searchValue=%E5%A4%84%E7%BD%9A&classId=', 'origin': '山西省财政厅' }] for each_url_info in prefix_url: each_url = each_url_info['url'] stop_flag = False logger.info('山西省财政厅 抓取URL:' + each_url) # get page count base_page = request_site_page(each_url) if base_page is None: logger.error('网页请求错误 %s' % each_url) continue base_soup = BeautifulSoup( base_page.text.encode(base_page.encoding).decode('utf-8'), 'lxml') try: page_count_text = base_soup.find(class_='page_li_1').text.strip() page_count = int(re.findall(r'\d+', page_count_text)[1]) except Exception as e: logger.warning(e) page_count = 0 logger.info('一共有%d页' % page_count) if db.crawler.find({'url': each_url}).count() > 0: last_updated_url = db.crawler.find_one({'url': each_url})['last_updated'] else: last_updated_url = '' # get crawler data for page_num in range(page_count): logger.info('第' + str(page_num + 1) + '页') url = each_url + '&pageNo=' + str(page_num + 1) try: page_response = request_site_page(url) if page_response is None: logger.error('网页请求错误 %s' % url) continue page_soup = BeautifulSoup( page_response.text.encode( page_response.encoding).decode('utf-8'), 'lxml') all_tr = page_soup.find(attrs={ "class": "list_bd new_cont nianbao" }).find_all('li') for index, each_tr in enumerate(all_tr): title = each_tr.find('a').text.strip() href = each_tr.find('a')['href'].strip() true_url = urljoin(url, href) # 判断是否为之前抓取过的 if true_url == last_updated_url: stop_flag = True logger.info('到达上次爬取的链接') break # 更新抓取的分割线 if page_num == 0 and index == 0: if db.crawler.find({'url': each_url}).count() > 0: if db.crawler.find_one( {'url': each_url})['last_updated'] != true_url: db.crawler.update_one( {'url': each_url}, {'$set': { 'last_updated': true_url }}) else: db.crawler.insert_one({ 'url': each_url, 'last_updated': true_url, 'origin': each_url_info['origin'] }) if re.search('(行政处罚)', title): date = each_tr.find('p').text.strip() publish_date = re.search(r'(\d{4}-\d{2}-\d{2})', date).group(1).strip() if db.finance_data.find({ 'url': true_url }).count() == 0: logger.info('山西省财政厅新公告:' + true_url + ' title: ' + title) post = { 'title': title, 'publishDate': publish_date, 'url': true_url, 'type': '行政处罚决定', 'origin': '山西省财政厅', 'status': 'not parsed' } if post not in result_list: result_list.append(post) else: if config['crawler_update_type'][ 'update_type'] == '0': break if stop_flag: logger.info('到达上次爬取的链接') break except Exception as e: logger.error(e) continue if len(result_list) > 0: logger.info('山西省财政厅一共有%d条新公告,导入数据库中......' % len(result_list)) r = db.finance_data.insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('山西省财政厅公告导入完成!') else: logger.error('山西省财政厅公告导入出现问题!') else: logger.info('山西省财政厅没有新公告!')
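# Several crawlers and parsers above re-decode pages with
#     r.text.encode(r.encoding).decode('utf-8')
# i.e. they undo requests' guessed decoding and re-read the raw bytes as UTF-8.  That
# round-trip only works while the guessed codec (often ISO-8859-1 when the server omits
# a charset) maps the bytes back losslessly.  Two more direct alternatives -- sketches,
# not what the code currently does:
def page_text_utf8(response):
    # decode the raw bytes explicitly
    return response.content.decode('utf-8', errors='replace')


def page_text_detected(response):
    # let requests re-detect the charset from the body
    response.encoding = response.apparent_encoding
    return response.text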
def jxczt_crawler(): result_list = [] # 用来保存最后存入数据库的数据 prefix_url = [ { 'url': 'http://www.jxf.gov.cn/govDepAction_cate_002_subCate_002004_page.nowPage_1.shtml', 'request_url': 'http://www.jxf.gov.cn/govDepAction_cate_002_subCate_002004_page.nowPage_', 'origin': '江西省财政厅' } ] for each_url_info in prefix_url: each_url = each_url_info['url'] stop_flag = False logger.info('江西省财政厅 抓取URL:' + each_url) # get page count base_page = request_site_page(each_url) if base_page is None: logger.error('网页请求错误 %s' % each_url) continue base_soup = BeautifulSoup(base_page.text.encode(base_page.encoding).decode('utf-8'), 'lxml') try: content_table = base_soup.find_all('table')[-7] page_count_text = content_table.find_all('tr')[-1].text.strip() page_count = int(re.findall(r'\d+', page_count_text)[2]) except Exception as e: logger.warning(e) page_count = 0 logger.info('一共有%d页' % page_count) if db.crawler.find({'url': each_url}).count() > 0: last_updated_url = db.crawler.find_one({'url': each_url})['last_updated'] else: last_updated_url = '' # get crawler data for page_num in range(page_count): logger.info('第' + str(page_num + 1) + '页') url = each_url_info['request_url'] + str(page_num + 1) + '.shtml' try: page_response = request_site_page(url) if page_response is None: logger.error('网页请求错误 %s' % url) continue page_soup = BeautifulSoup(page_response.text.encode(page_response.encoding).decode('utf-8'), 'lxml') content_table = page_soup.find_all('table')[-5] all_li = content_table.find_all('tr') for index, each_result in enumerate(all_li): if len(each_result.find_all('a')) == 0: continue title = each_result.find('a').attrs['title'].strip() href = each_result.find('a').attrs['href'].strip() true_url = urljoin(url, href) # 判断是否为之前抓取过的 if true_url == last_updated_url: stop_flag = True logger.info('到达上次爬取的链接') break # 更新抓取的分割线 if page_num == 0 and index == 0: if db.crawler.find({'url': each_url}).count() > 0: if db.crawler.find_one({'url': each_url})['last_updated'] != true_url: db.crawler.update_one({'url': each_url}, {'$set': {'last_updated': true_url}}) else: db.crawler.insert_one( {'url': each_url, 'last_updated': true_url, 'origin': each_url_info['origin']}) publish_date = each_result.find_all('td')[-1].text.strip() if re.search('.*决定书.*', title): if db.finance_data.find({'url': true_url}).count() == 0: logger.info('江西省财政厅新公告:' + true_url + ' title: ' + title) post = { 'title': title, 'publishDate': publish_date, 'url': true_url, 'type': '行政处罚决定', 'origin': '江西省财政厅', 'status': 'not parsed' } if post not in result_list: result_list.append(post) else: if config['crawler_update_type']['update_type'] == '0': break if stop_flag: logger.info('到达上次爬取的链接') break except Exception as e: logger.error(e) continue if len(result_list) > 0: logger.info('江西省财政厅一共有%d条新公告,导入数据库中......' % len(result_list)) r = db.finance_data.insert_many(result_list) if len(r.inserted_ids) == len(result_list): logger.info('江西省财政厅公告导入完成!') else: logger.error('江西省财政厅公告导入出现问题!') else: logger.info('江西省财政厅没有新公告!')
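# jxczt_crawler() locates its tables purely by position (find_all('table')[-7], then
# [-5]), which breaks silently whenever the portal adds or removes a wrapper table.  One
# layout-agnostic heuristic -- an alternative sketch, not the current behaviour -- is to
# pick the table that actually carries the link rows:
def table_with_most_links(soup):
    """Return the <table> in a BeautifulSoup document containing the most <a> tags."""
    return max(soup.find_all('table'), key=lambda t: len(t.find_all('a')), default=None)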
def local_csrc_crawler(): # 已有单独页面的行政处罚决定链接 xzcf_url_list = [ {'url': 'http://www.csrc.gov.cn/pub/beijing/bjxyzl/bjxzcf/', 'area': '北京证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/beijing/bjxzcf/', 'area': '北京证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/tianjin/xzcf/', 'area': '天津证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/hebei/hbxzcf/', 'area': '河北证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/shanxi/xzcf/', 'area': '山西证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/neimenggu/nmgxzcf/', 'area': '内蒙古证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/liaoning/lnjxzcf/', 'area': '辽宁证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/jilin/jlxzcf/', 'area': '吉林证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/heilongjiang/hljjxzcf/', 'area': '黑龙江证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/shanghai/xzcf/', 'area': '上海证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/jiangsu/jsxzcf/', 'area': '江苏证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zhejiang/zjxzcf/', 'area': '浙江证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/anhui/ahxzcf/', 'area': '安徽证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/fujian/fjjxzcf/', 'area': '福建证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/jiangxi/jxxzcf/', 'area': '江西证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/shandong/sdxzcf/', 'area': '山东证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/henan/hnxzcf/', 'area': '河南证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/hubei/hbxzcf/', 'area': '湖北证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/hunan/hnxzcf/', 'area': '湖南证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/guangdong/xzcf/', 'area': '广东证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/hainan/hnjxzcf/', 'area': '海南证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/chongqing/cqjxzcf/', 'area': '重庆证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/sichuan/scxzcf/', 'area': '四川证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/guizhou/gzxzcf/', 'area': '贵州证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/xizang/xzxzcf/', 'area': '西藏证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/qinghai/qhxzcf/', 'area': '青海证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/xinjiang/xjxzcf/', 'area': '新疆证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/shenzhen/xzcf/', 'area': '深圳证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/dalian/dlxzcf/', 'area': '大连证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/ningbo/nbxzcf/', 'area': '宁波证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/xiamen/xmxzcf/', 'area': '厦门证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/qingdao/xzcf/', 'area': '青岛证监局', 'type': '行政处罚决定'} ] # 已有单独页面的监管措施链接 jgcs_url_list = [ {'url': 'http://www.csrc.gov.cn/pub/beijing/bjxyzl/bjxzjgcs/', 'area': '北京证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/beijing/bjjgcs/', 'area': '北京证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/jilin/jljgcs/', 'area': '吉林证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/heilongjiang/jgcs/', 'area': '黑龙江证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zhejiang/zjcxxx/', 'area': '浙江证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/hunan/hnjxzjgcs/', 'area': '湖南证监局', 'type': '监管措施'}, {'url': 
'http://www.csrc.gov.cn/pub/guangdong/gdjjgcs/', 'area': '广东证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/shenzhen/ztzl/ssgsjgxx/jgcs/', 'area': '深圳证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/dalian/dljgcs/', 'area': '大连证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/xiamen/xmjgcs/', 'area': '厦门证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/qingdao/jgcs/', 'area': '青岛证监局', 'type': '监管措施'} ] # 以下地址检索标题中含有“行政处罚决定书”的公告 xzcf_search_url_list = [ {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofheb/', 'area': '河北证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofsx/', 'area': '山西证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofhlj/', 'area': '黑龙江证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofjs/', 'area': '江苏证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofsd/', 'area': '山东证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofhen/', 'area': '河南证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofhn/', 'area': '湖南证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofgx/', 'area': '广西证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofhan/', 'area': '海南证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofyn/', 'area': '云南证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofxz/', 'area': '西藏证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofsax/', 'area': '陕西证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofgs/', 'area': '甘肃证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofnx/', 'area': '宁夏证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofsz/', 'area': '深圳证监局', 'type': '行政处罚决定'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofqd/', 'area': '青岛证监局', 'type': '行政处罚决定'}, ] # 以下地址检索标题中含有“行政处罚”的公告 xzcf_search_url_other_list = [ {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofsh/', 'area': '上海证监局', 'type': '行政处罚决定'} ] # 搜索名称中有“措施的决定”或者“行政监管措施决定书”的公告 jgcs_search_url_list = [ {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofbj/', 'area': '北京证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicoftj/', 'area': '天津证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofheb/', 'area': '河北证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofsx/', 'area': '山西证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofnmg/', 'area': '内蒙古证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofln/', 'area': '辽宁证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofjl/', 'area': '吉林证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofhlj/', 'area': '黑龙江证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofsh/', 'area': '上海证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofjs/', 'area': '江苏证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofzj/', 'area': '浙江证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofah/', 'area': '安徽证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicoffj/', 'area': '福建证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofjx/', 'area': '江西证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofsd/', 'area': '山东证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofhen/', 'area': '河南证监局', 'type': '监管措施'}, {'url': 
'http://www.csrc.gov.cn/pub/zjhpublicofhb/', 'area': '湖北证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofhn/', 'area': '湖南证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofgd/', 'area': '广东证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofgx/', 'area': '广西证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofhan/', 'area': '海南证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofcq/', 'area': '重庆证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofsc/', 'area': '四川证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofgz/', 'area': '贵州证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofyn/', 'area': '云南证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofxz/', 'area': '西藏证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofsax/', 'area': '陕西证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofgs/', 'area': '甘肃证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofqh/', 'area': '青海证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofnx/', 'area': '宁夏证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofxj/', 'area': '新疆证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofsz/', 'area': '深圳证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofdl/', 'area': '大连证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofnb/', 'area': '宁波证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofxm/', 'area': '厦门证监局', 'type': '监管措施'}, {'url': 'http://www.csrc.gov.cn/pub/zjhpublicofqd/', 'area': '青岛证监局', 'type': '监管措施'} ] logger.info('地方证监局 数据抓取') new_local_csrc_announcement_list = [] # 已有单独页面的行政处罚决定链接 for each_xzcf_url_info in xzcf_url_list: logger.info(each_xzcf_url_info['area'] + each_xzcf_url_info['type'] + ' ' + each_xzcf_url_info['url']) # get page_count page_count_url = each_xzcf_url_info['url'] response = request_site_page(page_count_url) if response is None: logger.error('网页请求错误 %s' % page_count_url) continue try: page_count = int(re.search(r'var countPage = (\d+)?//共多少页', response.text.encode(response.encoding).decode('utf-8')).group(1).strip()) \ if re.search(r'var countPage = (\d+)?//共多少页', response.text.encode(response.encoding).decode('utf-8')) else 0 logger.info(each_xzcf_url_info['area'] + each_xzcf_url_info['type'] + ' -- 一共有%d页' % page_count) except Exception as e: logger.error(e) page_count = 0 # get crawler data for i in range(page_count): logger.info(each_xzcf_url_info['area'] + each_xzcf_url_info['type'] + '-- 第%d页' % (i + 1)) url = each_xzcf_url_info['url'] + 'index_' + str(i) + '.html' if i > 0 \ else each_xzcf_url_info['url'] + 'index.html' try: content_response = request_site_page(url) if content_response is None: logger.error('网页请求错误 %s' % url) continue content_soup = bs(content_response.content, 'lxml') if content_response else bs('', 'lxml') dl_content = content_soup.find(class_='fl_list') if not dl_content: logger.error('网页请求错误 %s' % url) continue for each_dd in dl_content.find_all('li'): if len(each_dd.find_all('a')) > 0: try: announcement_url = urljoin(url, each_dd.find('a').attrs['href']) if db.csrc_data.find({'url': announcement_url}).count() == 0: title = each_dd.find('a').text.strip() announcement_date = each_dd.find('span').text.strip() logger.info( each_xzcf_url_info['area'] + each_xzcf_url_info['type'] + '新公告:' + announcement_url) post = { 'title': title, 
'publishDate': announcement_date, 'url': announcement_url, 'type': each_xzcf_url_info['type'], 'origin': each_xzcf_url_info['area'], 'status': 'not parsed' } if post not in new_local_csrc_announcement_list: new_local_csrc_announcement_list.append(post) else: if config['crawler_update_type']['update_type'] == '0': break except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue except Exception as e: logger.error(e) continue logger.info('\n') # 已有单独页面的监管措施链接 for each_jgcs_url_info in jgcs_url_list: logger.info(each_jgcs_url_info['area'] + each_jgcs_url_info['type'] + ' ' + each_jgcs_url_info['url']) # get page_count page_count_url = each_jgcs_url_info['url'] response = request_site_page(page_count_url) if response is None: logger.error('网页请求错误 %s' % page_count_url) continue try: page_count = int(re.search(r'var countPage = (\d+)?//共多少页', response.text.encode(response.encoding).decode('utf-8')).group(1).strip()) if \ re.search(r'var countPage = (\d+)?//共多少页', response.text.encode(response.encoding).decode('utf-8')) else 0 logger.info(each_jgcs_url_info['area'] + each_jgcs_url_info['type'] + ' -- 一共有%d页' % page_count) except Exception as e: logger.error(e) page_count = 0 # get crawler data for i in range(page_count): logger.info(each_jgcs_url_info['area'] + each_jgcs_url_info['type'] + ' -- 第%d页' % (i + 1)) url = each_jgcs_url_info['url'] + 'index_' + str(i) + '.html' if i > 0 \ else each_jgcs_url_info['url'] + 'index.html' try: content_response = request_site_page(url) if content_response is None: logger.error('网页请求错误 %s' % url) continue content_soup = bs(content_response.content, 'lxml') if content_response else bs('', 'lxml') dl_content = content_soup.find(class_='fl_list') if not dl_content: logger.error('网页请求错误 %s' % url) continue for each_dd in dl_content.find_all('li'): try: if len(each_dd.find_all('a')) > 0: announcement_url = urljoin(url, each_dd.find('a').attrs['href']) if db.csrc_data.find({'url': announcement_url}).count() == 0: title = each_dd.find('a').attrs['title'].strip() announcement_date = each_dd.find('span').text.strip() logger.info( each_jgcs_url_info['area'] + each_jgcs_url_info['type'] + '新公告:' + announcement_url) post = { 'title': title, 'publishDate': announcement_date, 'url': announcement_url, 'type': each_jgcs_url_info['type'], 'origin': each_jgcs_url_info['area'], 'status': 'not parsed' } if post not in new_local_csrc_announcement_list: new_local_csrc_announcement_list.append(post) else: if config['crawler_update_type']['update_type'] == '0': break except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue except Exception as e: logger.error(e) continue logger.info('\n') # 以下地址检索标题中含有“行政处罚决定书”的公告 for each_xzcf_search_url_info in xzcf_search_url_list: logger.info(each_xzcf_search_url_info['area'] + each_xzcf_search_url_info['type'] + '检索' + ' ' + each_xzcf_search_url_info['url']) # get page_count page_count_url = each_xzcf_search_url_info['url'] params = { 'SType': '1', 'searchColumn': 'biaoti', 'searchYear': 'all', 'preSWord': 'docTitle=("行政处罚决定书")', 'sword': '行政处罚决定书', 'searchAgain': '', 'page': 1, 'res_wenzhong': '', 'res_wenzhonglist': '', 'wenzhong': '', 'pubwebsite': '/' + page_count_url.split('/')[-2] + '/' } response = request_site_page('http://www.csrc.gov.cn/wcm/govsearch/simp_gov_list.jsp', params=params, methods='post') if response is None: logger.error('网页请求错误') continue try: page_count = int(int(re.search(r'var m_nRecordCount = (\d+)?;', response.text).group(1).strip()) / 20 + 1) logger.info( 
each_xzcf_search_url_info['area'] + each_xzcf_search_url_info['type'] + '检索 一共有%d页' % page_count) except Exception as e: logger.error(e) page_count = 0 # get crawler data for i in range(page_count): logger.info( each_xzcf_search_url_info['area'] + each_xzcf_search_url_info['type'] + '检索 第%d页' % (i + 1)) params['page'] = i + 1 try: content_response = request_site_page('http://www.csrc.gov.cn/wcm/govsearch/simp_gov_list.jsp', params=params, methods='post') if content_response is None: logger.error('网页请求错误') continue content_soup = bs(content_response.content, 'lxml') if content_response else bs('', 'lxml') dl_content = content_soup.find(id='documentContainer') if not dl_content: logger.error('网页请求错误') continue for each_row in dl_content.find_all(class_='row'): try: announcement_url = urljoin(each_xzcf_search_url_info['url'], each_row.find('a').attrs['href']) announcement_url = announcement_url.split('?')[0].strip() if db.csrc_data.find({'url': announcement_url}).count() == 0: title = each_row.find('a').text.strip() announcement_date = each_row.find(class_='fbrq').text.strip() logger.info(each_xzcf_search_url_info['area'] + each_xzcf_search_url_info['type'] + '检索新公告:' + announcement_url) post = { 'title': title, 'publishDate': announcement_date, 'url': announcement_url, 'type': each_xzcf_search_url_info['type'], 'origin': each_xzcf_search_url_info['area'], 'status': 'not parsed' } if post not in new_local_csrc_announcement_list: new_local_csrc_announcement_list.append(post) else: if config['crawler_update_type']['update_type'] == '0': break except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue except Exception as e: logger.error(e) continue logger.info('\n') # 以下地址检索标题中含有“行政处罚”的公告 for each_xzcf_search_url_other_info in xzcf_search_url_other_list: logger.info(each_xzcf_search_url_other_info['area'] + each_xzcf_search_url_other_info['type'] + '检索' + ' ' + each_xzcf_search_url_other_info['url']) # get page_count page_count_url = each_xzcf_search_url_other_info['url'] params = { 'SType': '1', 'searchColumn': 'biaoti', 'searchYear': 'all', 'preSWord': 'docTitle=("行政处罚")', 'sword': '行政处罚', 'searchAgain': '', 'page': 1, 'res_wenzhong': '', 'res_wenzhonglist': '', 'wenzhong': '', 'pubwebsite': '/' + page_count_url.split('/')[-2] + '/' } response = request_site_page('http://www.csrc.gov.cn/wcm/govsearch/simp_gov_list.jsp', params=params, methods='post') if response is None: logger.error('网页请求错误') continue try: page_count = int(int(re.search(r'var m_nRecordCount = (\d+)?;', response.text).group(1).strip()) / 20 + 1) logger.info(each_xzcf_search_url_other_info['area'] + each_xzcf_search_url_other_info['type'] + '检索 一共有%d页' % page_count) except Exception as e: logger.error(e) page_count = 0 # get crawler data for i in range(page_count): logger.info( each_xzcf_search_url_other_info['area'] + each_xzcf_search_url_other_info['type'] + '检索 第%d页' % (i + 1)) params['page'] = i + 1 try: content_response = request_site_page('http://www.csrc.gov.cn/wcm/govsearch/simp_gov_list.jsp', params=params, methods='post') if content_response is None: logger.error('网页请求错误') continue content_soup = bs(content_response.content, 'lxml') if content_response else bs('', 'lxml') dl_content = content_soup.find(id='documentContainer') if not dl_content: logger.error('网页请求错误') continue for each_row in dl_content.find_all(class_='row'): try: announcement_url = urljoin(each_xzcf_search_url_other_info['url'], each_row.find('a').attrs['href']) announcement_url = announcement_url.split('?')[0].strip() if 
db.csrc_data.find({'url': announcement_url}).count() == 0: title = each_row.find('a').text.strip() announcement_date = each_row.find(class_='fbrq').text.strip() logger.info(each_xzcf_search_url_other_info['area'] + each_xzcf_search_url_other_info['type'] + '检索新公告:' + announcement_url) post = { 'title': title, 'publishDate': announcement_date, 'url': announcement_url, 'type': each_xzcf_search_url_other_info['type'], 'origin': each_xzcf_search_url_other_info['area'], 'status': 'not parsed' } if post not in new_local_csrc_announcement_list: new_local_csrc_announcement_list.append(post) else: if config['crawler_update_type']['update_type'] == '0': break except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue except Exception as e: logger.error(e) continue logger.info('\n') # 搜索名称中有“措施的决定”或者“行政监管措施决定书”的公告 for each_jgcs_search_url_info in jgcs_search_url_list: logger.info(each_jgcs_search_url_info['area'] + each_jgcs_search_url_info['type'] + '检索' + ' ' + each_jgcs_search_url_info['url']) # 措施的决定 # get page_count page_count_url = each_jgcs_search_url_info['url'] params = { 'SType': '1', 'searchColumn': 'biaoti', 'searchYear': 'all', 'preSWord': 'docTitle=("措施的决定")', 'sword': '措施的决定', 'searchAgain': '', 'page': 1, 'res_wenzhong': '', 'res_wenzhonglist': '', 'wenzhong': '', 'pubwebsite': '/' + page_count_url.split('/')[-2] + '/' } response = request_site_page('http://www.csrc.gov.cn/wcm/govsearch/simp_gov_list.jsp', params=params, methods='post') if response is None: logger.error('网页请求错误') continue try: page_count = int(int(re.search(r'var m_nRecordCount = (\d+)?;', response.text).group(1).strip()) / 20 + 1) logger.info( each_jgcs_search_url_info['area'] + each_jgcs_search_url_info['type'] + '检索 一共有%d页' % page_count) except Exception as e: logger.error(e) page_count = 0 # get crawler data for i in range(page_count): logger.info( each_jgcs_search_url_info['area'] + each_jgcs_search_url_info['type'] + '检索 第%d页' % (i + 1)) params['page'] = i + 1 try: content_response = request_site_page('http://www.csrc.gov.cn/wcm/govsearch/simp_gov_list.jsp', params=params, methods='post') if content_response is None: logger.error('网页请求错误') continue content_soup = bs(content_response.content, 'lxml') if content_response else bs('', 'lxml') dl_content = content_soup.find(id='documentContainer') if not dl_content: logger.error('网页请求错误') continue for each_row in dl_content.find_all(class_='row'): try: announcement_url = urljoin(each_jgcs_search_url_info['url'], each_row.find('a').attrs['href']) announcement_url = announcement_url.split('?')[0].strip() if db.csrc_data.find({'url': announcement_url}).count() == 0: title = each_row.find('a').text.strip() announcement_date = each_row.find(class_='fbrq').text.strip() logger.info(each_jgcs_search_url_info['area'] + each_jgcs_search_url_info['type'] + '检索新公告:' + announcement_url) post = { 'title': title, 'publishDate': announcement_date, 'url': announcement_url, 'type': each_jgcs_search_url_info['type'], 'origin': each_jgcs_search_url_info['area'], 'status': 'not parsed' } if post not in new_local_csrc_announcement_list: new_local_csrc_announcement_list.append(post) else: if config['crawler_update_type']['update_type'] == '0': break except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue except Exception as e: logger.error(e) continue # 行政监管措施决定书 # get page_count page_count_url = each_jgcs_search_url_info['url'] params = { 'SType': '1', 'searchColumn': 'biaoti', 'searchYear': 'all', 'preSWord': 'docTitle=("行政监管措施决定书")', 'sword': 
'行政监管措施决定书', 'searchAgain': '', 'page': 1, 'res_wenzhong': '', 'res_wenzhonglist': '', 'wenzhong': '', 'pubwebsite': '/' + page_count_url.split('/')[-2] + '/' } response = request_site_page('http://www.csrc.gov.cn/wcm/govsearch/simp_gov_list.jsp', params=params, methods='post') try: page_count = int(int(re.search(r'var m_nRecordCount = (\d+)?;', response.text).group(1).strip()) / 20 + 1) logger.info( each_jgcs_search_url_info['area'] + each_jgcs_search_url_info['type'] + '检索 一共有%d页' % page_count) except Exception as e: logger.error(e) page_count = 0 # get crawler data for i in range(page_count): logger.info( each_jgcs_search_url_info['area'] + each_jgcs_search_url_info['type'] + '检索 第%d页' % (i + 1)) params['page'] = i + 1 try: content_response = request_site_page('http://www.csrc.gov.cn/wcm/govsearch/simp_gov_list.jsp', params=params, methods='post') content_soup = bs(content_response.content, 'lxml') if content_response else bs('', 'lxml') dl_content = content_soup.find(id='documentContainer') for each_row in dl_content.find_all(class_='row'): try: announcement_url = urljoin(each_jgcs_search_url_info['url'], each_row.find('a').attrs['href']) announcement_url = announcement_url.split('?')[0].strip() if db.csrc_data.find({'url': announcement_url}).count() == 0: title = each_row.find('a').text.strip() announcement_date = each_row.find(class_='fbrq').text.strip() logger.info(each_jgcs_search_url_info['area'] + each_jgcs_search_url_info['type'] + '检索新公告:' + announcement_url) post = { 'title': title, 'publishDate': announcement_date, 'url': announcement_url, 'type': each_jgcs_search_url_info['type'], 'origin': each_jgcs_search_url_info['area'], 'status': 'not parsed' } if post not in new_local_csrc_announcement_list: new_local_csrc_announcement_list.append(post) else: if config['crawler_update_type']['update_type'] == '0': break except Exception as e: logger.error(e) logger.warning('提取公告url出现问题') continue except Exception as e: logger.error(e) continue logger.info('\n') if len(new_local_csrc_announcement_list) > 0: logger.info('地方证监局一共有%d条新公告,导入数据库中......' % len(new_local_csrc_announcement_list)) r = db.csrc_data.insert_many(new_local_csrc_announcement_list) if len(r.inserted_ids) == len(new_local_csrc_announcement_list): logger.info('地方证监局公告导入完成!') else: logger.error('地方证监局公告导入出现问题!') else: logger.info('地方证监局没有新公告!')
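# local_csrc_crawler() repeats the same search loop four times, changing only the keyword
# and the target sub-site.  The endpoint and parameter names below are the ones already
# used above; the wrapper itself is only an illustrative refactor, not existing project code.
def build_csrc_search_params(keyword, site_url, page=1):
    return {
        'SType': '1',
        'searchColumn': 'biaoti',
        'searchYear': 'all',
        'preSWord': 'docTitle=("{}")'.format(keyword),
        'sword': keyword,
        'searchAgain': '',
        'page': page,
        'res_wenzhong': '',
        'res_wenzhonglist': '',
        'wenzhong': '',
        'pubwebsite': '/' + site_url.split('/')[-2] + '/',
    }
# Usage sketch:
# request_site_page('http://www.csrc.gov.cn/wcm/govsearch/simp_gov_list.jsp',
#                   params=build_csrc_search_params('行政处罚决定书', each_info['url']),
#                   methods='post')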
def crawler():
    result_list = []

    # Section 1: paginated announcement list under first_url
    page_num = 1
    url = first_url.format(str(page_num))
    response = request_site_page(url)
    stop_flag = False
    if response is None:
        logger.error('网页请求错误{}'.format(url))
    else:
        response.encoding = response.apparent_encoding
    soup = bs(response.content if response else '', 'lxml')
    if db.crawler.find({'url': first_url}).count() > 0:
        last_updated_url = db.crawler.find_one({'url': first_url})['last_updated']
    else:
        last_updated_url = ''
    page_count = re.findall(r'共(\d+)页', soup.text)[-1]
    while page_num <= int(page_count):
        try:
            data_list = soup.find(attrs={"class": "publicity_table"}).find_all('tr')
            for index, each_data in enumerate(data_list):
                href = each_data.find('a')['href']
                anc_url = urljoin(url, href)
                if anc_url == last_updated_url:
                    stop_flag = True
                    logger.info('到达上次爬取的链接')
                    break
                # remember the newest announcement url as the bookmark for the next run
                if page_num == 1 and index == 0:
                    if db.crawler.find({'url': first_url}).count() > 0:
                        if db.crawler.find_one({'url': first_url})['last_updated'] != anc_url:
                            db.crawler.update_one({'url': first_url},
                                                  {'$set': {'last_updated': anc_url}})
                    else:
                        db.crawler.insert_one({'url': first_url, 'last_updated': anc_url,
                                               'origin': gov_name})
                title = each_data.find('a')['title'].strip()
                publish_date = each_data.find_all('td')[-1].text.strip()
                if db[collection_name].count_documents({'url': anc_url}) == 0:
                    info = {
                        'title': title,
                        'publishDate': publish_date,
                        'url': anc_url,
                        'type': '行政处罚决定',
                        'origin': gov_name,
                        'status': 'not parsed'
                    }
                    logger.info('{} 新公告:{} url: {}'.format(gov_name, info['title'], anc_url))
                    if info not in result_list:
                        result_list.append(info)
                else:
                    if config['crawler_update_type']['update_type'] == '0':
                        break
            page_num += 1
            url = first_url.format(str(page_num))
            response = request_site_page(url)
            if response is not None:
                response.encoding = response.apparent_encoding
            soup = bs(response.content if response else '', 'lxml')
            if stop_flag:
                logger.info('到达上次爬取的链接')
                break
        except Exception as e:
            logger.error(e)
            logger.warning('提取公告url出现问题')
            continue

    # Section 2: per-area POST queries against second_url
    response = request_site_page(second_url, params=params, methods='post')
    if response is None:
        logger.error('网页请求错误{}'.format(second_url))
    else:
        response.encoding = response.apparent_encoding
    for each_area in area_list:
        stop_flag = False
        if db.crawler.find({'url': each_area + ' ' + second_url}).count() > 0:
            last_updated_url = db.crawler.find_one(
                {'url': each_area + ' ' + second_url})['last_updated']
        else:
            last_updated_url = ''
        try:
            params['searhvalue'] = each_area
            response = request_site_page(second_url, params=params, methods='post')
            if response is not None:
                response.encoding = response.apparent_encoding
            soup = bs(response.content if response else '', 'lxml')
            data_list = soup.find_all('tr')
            for index, each_data in enumerate(data_list):
                href = each_data.find('a')['href']
                anc_url = urljoin(second_url, href)
                if anc_url == last_updated_url:
                    stop_flag = True
                    logger.info('到达上次爬取的链接')
                    break
                if index == 0:
                    if db.crawler.find({'url': each_area + ' ' + second_url}).count() > 0:
                        if db.crawler.find_one(
                                {'url': each_area + ' ' + second_url})['last_updated'] != anc_url:
                            db.crawler.update_one({'url': each_area + ' ' + second_url},
                                                  {'$set': {'last_updated': anc_url}})
                    else:
                        db.crawler.insert_one({'url': each_area + ' ' + second_url,
                                               'last_updated': anc_url, 'origin': gov_name})
                title = each_data.find('a').text.strip()
                publish_date = re.search(r'(\d+/\d+/\d+)', href).group(1).strip().replace('/', '-')
                if db[collection_name].count_documents({'url': anc_url}) == 0:
                    info = {
                        'title': title,
                        'publishDate': publish_date,
                        'url': anc_url,
                        'type': '行政处罚决定',
                        'origin': gov_name,
                        'status': 'not parsed'
                    }
                    logger.info('{} 新公告:{} url: {}'.format(gov_name, info['title'], anc_url))
                    if info not in result_list:
                        result_list.append(info)
                else:
                    if config['crawler_update_type']['update_type'] == '0':
                        break
        except Exception as e:
            logger.error(e)
            logger.warning('提取公告url出现问题')
            continue

    # Section 3: subject-info POST list under third_url
    response = request_site_page(third_url)
    stop_flag = False
    if response is None:
        logger.error('网页请求错误{}'.format(third_url))
    else:
        response.encoding = response.apparent_encoding
    XZCF_url = 'http://anhui.chinatax.gov.cn/module/xxgk/subjectinfo.jsp?showsub=1'
    data_list = [{
        'infotypeId': 0,
        'jdid': '39',
        'nZtflid': 40,
        'vc_bm': '0702',
        'area': '11340000002986061N',
        'strSearchUrl': '/module/xxgk/subjectinfo.jsp'
    }]
    for item in item_list:
        if db.crawler.find({'url': item + third_url}).count() > 0:
            last_updated_url = db.crawler.find_one({'url': item + third_url})['last_updated']
        else:
            last_updated_url = ''
        response = request_site_page(XZCF_url, params=data_list[0], methods='post')
        if response is None:
            logger.error('网页请求错误{}'.format(XZCF_url))
            continue
        response.encoding = response.apparent_encoding
        soup = bs(response.text, 'lxml')
        tr_list = soup.find('table', style='border-top:none').find_all('tr')
        del tr_list[0]  # drop the header row
        for index, each_tr in enumerate(tr_list):
            try:
                href = each_tr.find('a')['href']
                anc_url = urljoin(third_url, href)
                if anc_url == last_updated_url:
                    stop_flag = True
                    logger.info('到达上次爬取的链接')
                    break
                if index == 0:
                    if db.crawler.find({'url': item + third_url}).count() > 0:
                        if db.crawler.find_one({'url': item + third_url})['last_updated'] != anc_url:
                            db.crawler.update_one({'url': item + third_url},
                                                  {'$set': {'last_updated': anc_url}})
                    else:
                        db.crawler.insert_one({'url': item + third_url, 'last_updated': anc_url,
                                               'origin': gov_name})
                title = each_tr.find('a').text.strip()
                publish_date = each_tr.find('td', align='center').text.strip()
                if db[collection_name].count_documents({'url': anc_url}) == 0:
                    info = {
                        'title': title,
                        'publishDate': publish_date,
                        'url': anc_url,
                        'type': '行政处罚决定',
                        'origin': gov_name,
                        'status': 'not parsed'
                    }
                    logger.info('{} 新公告:{} url: {}'.format(gov_name, info['title'], anc_url))
                    if info not in result_list:
                        result_list.append(info)
                else:
                    if config['crawler_update_type']['update_type'] == '0':
                        break
            except Exception as e:
                logger.error(e)
                logger.warning('提取公告url出现问题')

    # Section 4: paginated announcement list under forth_url
    url = forth_url.format('1')
    response = request_site_page(url)
    stop_flag = False
    if response is None:
        logger.error('网页请求错误{}'.format(url))
    else:
        response.encoding = response.apparent_encoding
    soup = bs(response.content if response else '', 'lxml')
    if db.crawler.find({'url': forth_url}).count() > 0:
        last_updated_url = db.crawler.find_one({'url': forth_url})['last_updated']
    else:
        last_updated_url = ''
    page_count = re.findall(r'共(\d+)页', soup.text)[-1]
    page_num = 1
    while page_num <= int(page_count):
        try:
            data_list = soup.find(attrs={"class": "publicity_table"}).find_all('tr')
            for index, each_data in enumerate(data_list):
                href = each_data.find_all('a')[-1]['href']
                anc_url = urljoin(url, href)
                if anc_url == last_updated_url:
                    stop_flag = True
                    logger.info('到达上次爬取的链接')
                    break
                if index == 0 and page_num == 1:
                    if db.crawler.find({'url': forth_url}).count() > 0:
                        if db.crawler.find_one({'url': forth_url})['last_updated'] != anc_url:
                            db.crawler.update_one({'url': forth_url},
                                                  {'$set': {'last_updated': anc_url}})
                    else:
                        db.crawler.insert_one({'url': forth_url, 'last_updated': anc_url,
                                               'origin': gov_name})
                title = each_data.find_all('a')[-1]['title']
                publish_date = each_data.find_all('td')[-1].text.strip()
                if db[collection_name].count_documents({'url': anc_url}) == 0:
                    info = {
                        'title': title,
                        'publishDate': publish_date,
                        'url': anc_url,
                        'type': '行政处罚决定',
                        'origin': gov_name,
                        'status': 'not parsed'
                    }
                    logger.info('{} 新公告:{} url: {}'.format(gov_name, info['title'], anc_url))
                    if info not in result_list:
                        result_list.append(info)
                else:
                    if config['crawler_update_type']['update_type'] == '0':
                        break
            page_num += 1
            url = forth_url.format(str(page_num))
            response = request_site_page(url)
            if response is not None:
                response.encoding = response.apparent_encoding
            soup = bs(response.content if response else '', 'lxml')
            if stop_flag:
                logger.info('到达上次爬取的链接')
                break
        except Exception as e:
            logger.error(e)
            logger.warning('提取公告url出现问题')
            if not each_data.find('a'):
                logger.info('公告已提取完毕')
                break
            continue

    # import the newly found announcements into MongoDB
    if len(result_list) > 0:
        logger.info('{}一共有{}条新公告,导入数据库中......'.format(gov_name, len(result_list)))
        r = db[collection_name].insert_many(result_list)
        if len(r.inserted_ids) == len(result_list):
            logger.info('{}公告导入完成!'.format(gov_name))
        else:
            logger.error('{}公告导入出现问题!'.format(gov_name))
    else:
        logger.info('{}没有新公告!'.format(gov_name))
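# Note: the four sub-crawlers above share the same "bookmark" pattern: the newest
# announcement URL for each listing is stored in the db.crawler collection, keyed by the
# listing URL, and later runs stop paging once that URL is reached again. The helper below
# is only a sketch of the same read/save steps expressed with a single pymongo upsert;
# the connection string and database name are illustrative assumptions, not this
# project's configuration.
#
# from pymongo import MongoClient
#
# client = MongoClient('mongodb://localhost:27017')   # assumed local MongoDB
# bookmarks = client['announcement']['crawler']       # assumed db/collection names
#
# def get_last_updated(listing_url):
#     """Return the newest announcement URL recorded for this listing, or ''."""
#     doc = bookmarks.find_one({'url': listing_url})
#     return doc['last_updated'] if doc else ''
#
# def save_last_updated(listing_url, newest_anc_url, origin):
#     """Upsert the bookmark; one call replaces the find/update/insert sequence above."""
#     bookmarks.update_one(
#         {'url': listing_url},
#         {'$set': {'last_updated': newest_anc_url, 'origin': origin}},
#         upsert=True,
#     )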