class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.session.headers.update(headers)
        self.db = DB()
        self.crawl_timestamp = int()
        self.url = "https://3g.dxy.cn/newh5/view/pneumonia"

    def run(self):
        while True:
            self.crawler()
            time.sleep(60)

    def crawler(self):
        while True:
            self.crawl_timestamp = int(
                datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            r = self.session.get(url=self.url)
            soup = BeautifulSoup(r.content, 'lxml')
            overall_information = re.search(
                r'\{("id".*?)\}',
                str(soup.find('script', attrs={'id': 'getStatisticsService'})))
            province_information = re.search(
                r'\[(.*?)\]',
                str(soup.find('script', attrs={'id': 'getListByCountryTypeService1'})))
            area_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getAreaStat'})))
            abroad_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getListByCountryTypeService2'})))
            news = re.search(
                r'\[(.*?)\]',
                str(soup.find('script', attrs={'id': 'getTimelineService'})))
            if not overall_information or not province_information or not area_information or not news:
                continue
            self.overall_parser(overall_information=overall_information)
            self.province_parser(province_information=province_information)
            self.area_parser(area_information=area_information)
            self.abroad_parser(abroad_information=abroad_information)
            self.news_parser(news=news)
            break
        logger.info('Successfully crawled.')

    def overall_parser(self, overall_information):
        overall_information = json.loads(overall_information.group(0))
        overall_information.pop('id')
        overall_information.pop('createTime')
        overall_information.pop('modifyTime')
        overall_information.pop('imgUrl')
        overall_information.pop('deleted')
        overall_information['countRemark'] = overall_information[
            'countRemark'].replace(' 疑似', ',疑似').replace(' 治愈', ',治愈').replace(
                ' 死亡', ',死亡').replace(' ', '')
        if not self.db.find_one(collection='DXYOverall', data=overall_information):
            overall_information['updateTime'] = self.crawl_timestamp
            overall_information = regex_parser(content=overall_information, key='countRemark')
            self.db.insert(collection='DXYOverall', data=overall_information)

    def province_parser(self, province_information):
        provinces = json.loads(province_information.group(0))
        for province in provinces:
            province.pop('id')
            province.pop('tags')
            province.pop('sort')
            province['comment'] = province['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYProvince', data=province):
                continue
            province['crawlTime'] = self.crawl_timestamp
            province['country'] = country_type.get(province['countryType'])
            self.db.insert(collection='DXYProvince', data=province)

    def area_parser(self, area_information):
        area_information = json.loads(area_information.group(0))
        for area in area_information:
            area['comment'] = area['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYArea', data=area):
                continue
            area['country'] = '中国'
            area['updateTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYArea', data=area)

    def abroad_parser(self, abroad_information):
        countries = json.loads(abroad_information.group(0))
        for country in countries:
            country.pop('id')
            country.pop('tags')
            country.pop('countryType')
            country.pop('provinceId')
            country['country'] = country.get('provinceName')
            country.pop('provinceShortName')
            country.pop('cityName')
            country.pop('sort')
            country['comment'] = country['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYArea', data=country):
                continue
            country['updateTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYArea', data=country)

    def news_parser(self, news):
        news = json.loads(news.group(0))
        for _news in news:
            _news.pop('pubDateStr')
            if self.db.find_one(collection='DXYNews', data=_news):
                continue
            _news['crawlTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYNews', data=_news)
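
# The DB helper instantiated above is defined elsewhere in the project and is
# not part of this file. Below is a minimal sketch of the interface the crawler
# relies on (find_one / insert keyed by collection name), assuming a local
# MongoDB instance accessed through pymongo. The class name, connection URI and
# database name are illustrative assumptions, not the project's actual
# configuration.
from pymongo import MongoClient


class MongoDBSketch:
    def __init__(self):
        # Assumed local MongoDB; adjust the URI for a real deployment.
        self.client = MongoClient('mongodb://localhost:27017/')
        self.db = self.client['DXY']  # assumed database name

    def find_one(self, collection, data):
        # Return the first document whose fields all equal `data`, or None.
        return self.db[collection].find_one(data)

    def insert(self, collection, data):
        self.db[collection].insert_one(data)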
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.session.headers.update(headers)
        self.db = DB()
        self.crawl_timestamp = int()

    def run(self):
        while True:
            self.crawler()
            time.sleep(300)

    def crawler(self):
        while True:
            self.crawl_timestamp = int(
                datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            try:
                r = self.session.get(url='https://3g.dxy.cn/newh5/view/pneumonia')
            except requests.exceptions.ChunkedEncodingError:
                continue
            except requests.exceptions.ConnectionError:
                logger.warning("Server disconnected.")
                break
            soup = BeautifulSoup(r.content, 'lxml')
            overall_information = re.search(
                r'\{("id".*?)\]\}',
                str(soup.find('script', attrs={'id': 'getStatisticsService'})))
            province_information = re.search(
                r'\[(.*?)\]',
                str(soup.find('script', attrs={'id': 'getListByCountryTypeService1'})))
            area_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getAreaStat'})))
            abroad_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getListByCountryTypeService2'})))
            news = re.search(
                r'\[(.*?)\]',
                str(soup.find('script', attrs={'id': 'getTimelineService'})))
            if not overall_information or not province_information or not area_information or not news:
                continue
            self.overall_parser(overall_information=overall_information)
            self.province_parser(province_information=province_information)
            self.area_parser(area_information=area_information)
            self.abroad_parser(abroad_information=abroad_information)
            self.news_parser(news=news)
            break

        while True:
            self.crawl_timestamp = int(
                datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            try:
                r = self.session.get(
                    url='https://file1.dxycdn.com/2020/0127/797/3393185293879908067-115.json')
            except requests.exceptions.ChunkedEncodingError:
                continue
            # Use try-except so the .json() call cannot raise an unhandled exception.
            try:
                if r.status_code != 200:
                    continue
                elif r.json().get('code') == 'success':
                    self.rumor_parser(rumors=r.json().get('data'))
                    break
                else:
                    continue
            except json.decoder.JSONDecodeError:
                continue
        logger.info('Successfully crawled.')

    def overall_parser(self, overall_information):
        overall_information = json.loads(overall_information.group(0))
        overall_information.pop('id')
        overall_information.pop('createTime')
        overall_information.pop('modifyTime')
        overall_information.pop('imgUrl')
        overall_information.pop('deleted')
        overall_information['countRemark'] = overall_information[
            'countRemark'].replace(' 疑似', ',疑似').replace(' 治愈', ',治愈').replace(
                ' 死亡', ',死亡').replace(' ', '')
        if not self.db.find_one(collection='DXYOverall', data=overall_information):
            overall_information['updateTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYOverall', data=overall_information)

    def province_parser(self, province_information):
        provinces = json.loads(province_information.group(0))
        for province in provinces:
            province.pop('id')
            province.pop('tags')
            province.pop('sort')
            province['comment'] = province['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYProvince', data=province):
                continue
            province['provinceEnglishName'] = city_name_map[
                province['provinceShortName']]['engName']
            province['crawlTime'] = self.crawl_timestamp
            province['country'] = country_type_map.get(province['countryType'])
            self.db.insert(collection='DXYProvince', data=province)

    def area_parser(self, area_information):
        area_information = json.loads(area_information.group(0))
        for area in area_information:
            area['comment'] = area['comment'].replace(' ', '')
            # Because the cities are given extra attributes below, the cities field
            # should not be used when checking for an identical document.
            cities_backup = area.pop('cities')
            if self.db.find_one(collection='DXYArea', data=area):
                continue
            # If this document is not yet in the database, put the attribute back
            # into the document before inserting it.
            area['cities'] = cities_backup
            area['countryName'] = '中国'
            area['countryEnglishName'] = 'China'
            area['continentName'] = '亚洲'
            area['continentEnglishName'] = 'Asia'
            area['provinceEnglishName'] = city_name_map[
                area['provinceShortName']]['engName']
            for city in area['cities']:
                if city['cityName'] != '待明确地区':
                    try:
                        city['cityEnglishName'] = city_name_map[
                            area['provinceShortName']]['cities'][city['cityName']]
                    except KeyError:
                        print(area['provinceShortName'], city['cityName'])
                        pass
                else:
                    city['cityEnglishName'] = 'Area not defined'
            area['updateTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYArea', data=area)

    def abroad_parser(self, abroad_information):
        countries = json.loads(abroad_information.group(0))
        for country in countries:
            country.pop('id')
            country.pop('tags')
            country.pop('countryType')
            country.pop('provinceId')
            country.pop('cityName')
            country.pop('sort')
            # The original provinceShortName is a blank string.
            country.pop('provinceShortName')
            # Rename the key continents to continentName.
            country['continentName'] = country.pop('continents')
            # Ding Xiang Yuan has a large number of duplicates whose values are
            # identical but whose modifyTime differs. modifyTime appears to be a
            # modification time shared across documents rather than specific to
            # this one, so the field is popped out.
            country.pop('modifyTime')
            # createTime also differs even when the values are the same.
            # Originally, createTime represented the first diagnosis of the virus
            # in an area, but it seems to mean something different for abroad data.
            country.pop('createTime')
            country['comment'] = country['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYArea', data=country):
                continue
            country['countryName'] = country.get('provinceName')
            country['provinceShortName'] = country.get('provinceName')
            country['continentEnglishName'] = continent_name_map.get(
                country['continentName'])
            country['countryEnglishName'] = country_name_map.get(
                country['countryName'])
            country['provinceEnglishName'] = country_name_map.get(
                country['countryName'])
            country['updateTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYArea', data=country)

    def news_parser(self, news):
        news = json.loads(news.group(0))
        for _news in news:
            _news.pop('pubDateStr')
            if self.db.find_one(collection='DXYNews', data=_news):
                continue
            _news['crawlTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYNews', data=_news)

    def rumor_parser(self, rumors):
        for rumor in rumors:
            rumor.pop('score')
            rumor['body'] = rumor['body'].replace(' ', '')
            if self.db.find_one(collection='DXYRumors', data=rumor):
                continue
            rumor['crawlTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYRumors', data=rumor)
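
# city_name_map, country_type_map, continent_name_map and country_name_map are
# imported from the project's name-mapping module and are not shown in this
# file. The literals below only illustrate the shapes the parsers above index
# into; the sample keys and values are assumptions for demonstration, not the
# real mapping tables.
city_name_map_example = {
    '湖北': {
        'engName': 'Hubei',
        'cities': {'武汉': 'Wuhan', '孝感': 'Xiaogan'},
    },
}
country_type_map_example = {1: '中国', 2: '国外'}  # countryType code -> label (assumed codes)
continent_name_map_example = {'亚洲': 'Asia', '欧洲': 'Europe'}
country_name_map_example = {'日本': 'Japan', '泰国': 'Thailand'}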
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.session.headers.update(headers)
        self.db = DB()
        self.crawl_timestamp = int()
        self.url = "https://3g.dxy.cn/newh5/view/pneumonia"
        self.rumor_url = "https://file1.dxycdn.com/2020/0127/797/3393185293879908067-115.json"
        self.overall_count = 0
        self.province_count = 0
        self.area_count = 0
        self.news_count = 0
        self.rumor_count = 0

    def run(self):
        while True:
            self.crawl()
            time.sleep(60)

    def crawl(self):
        # Reset counters.
        self.overall_count = 0
        self.province_count = 0
        self.area_count = 0
        self.news_count = 0
        self.rumor_count = 0
        self.crawl_timestamp = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
        r = self.session.get(url=self.url)
        soup = BeautifulSoup(r.content, 'lxml')
        overall_information = re.search(
            r'\{("id".*?)\}',
            str(soup.find('script', attrs={'id': 'getStatisticsService'})))
        province_information = re.search(
            r'\[(.*?)\]',
            str(soup.find('script', attrs={'id': 'getListByCountryTypeService1'})))
        area_information = re.search(
            r'\[(.*)\]',
            str(soup.find('script', attrs={'id': 'getAreaStat'})))
        abroad_information = re.search(
            r'\[(.*)\]',
            str(soup.find('script', attrs={'id': 'getListByCountryTypeService2'})))
        news = re.search(
            r'\[(.*?)\]',
            str(soup.find('script', attrs={'id': 'getTimelineService'})))
        rumor_resp = self.session.get(url=self.rumor_url + '?t=' + str(self.crawl_timestamp))
        if rumor_resp.status_code == 200:
            rumor_information = rumor_resp.json()
        else:
            logger.warning("Failed to get rumor json. status code: %s, reason: %s."
                           % (rumor_resp.status_code, rumor_resp.reason))
            rumor_information = None
        if overall_information or province_information or area_information or news or rumor_information:
            self.overall_parser(overall_information=overall_information)
            self.province_parser(province_information=province_information)
            self.area_parser(area_information=area_information)
            self.abroad_parser(abroad_information=abroad_information)
            self.news_parser(news=news)
            self.rumor_parser(rumor_information=rumor_information)
        logger.info('Successfully crawled. Added %d overall, %d province, %d area, %d news, %d rumor.'
                    % (self.overall_count, self.province_count, self.area_count,
                       self.news_count, self.rumor_count))

    def overall_parser(self, overall_information):
        if overall_information is None:
            return
        overall_information = json.loads(overall_information.group(0))
        overall_information.pop('id')
        overall_information.pop('createTime')
        overall_information.pop('modifyTime')
        overall_information.pop('imgUrl')
        overall_information.pop('deleted')
        overall_information['countRemark'] = overall_information[
            'countRemark'].replace(' 疑似', ',疑似').replace(' 治愈', ',治愈').replace(
                ' 死亡', ',死亡').replace(' ', '')
        if not self.db.find_one(collection='DXYOverall', data=overall_information):
            overall_information['updateTime'] = self.crawl_timestamp
            overall_information = regex_parser(content=overall_information, key='countRemark')
            self.overall_count += 1
            self.db.insert(collection='DXYOverall', data=overall_information)

    def province_parser(self, province_information):
        if province_information is None:
            return
        provinces = json.loads(province_information.group(0))
        for province in provinces:
            province.pop('id')
            province.pop('tags')
            province.pop('sort')
            province['comment'] = province['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYProvince', data=province):
                continue
            province['crawlTime'] = self.crawl_timestamp
            province['country'] = country_type.get(province['countryType'])
            self.province_count += 1
            self.db.insert(collection='DXYProvince', data=province)

    def area_parser(self, area_information):
        if area_information is None:
            return
        area_information = json.loads(area_information.group(0))
        for area in area_information:
            area['comment'] = area['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYArea', data=area):
                continue
            area['country'] = '中国'
            area['updateTime'] = self.crawl_timestamp
            self.area_count += 1
            self.db.insert(collection='DXYArea', data=area)

    def abroad_parser(self, abroad_information):
        if abroad_information is None:
            return
        countries = json.loads(abroad_information.group(0))
        for country in countries:
            country.pop('id')
            country.pop('tags')
            country.pop('countryType')
            country.pop('provinceId')
            country['country'] = country.get('provinceName')
            country['provinceShortName'] = country.get('provinceName')
            country.pop('cityName')
            country.pop('sort')
            country['comment'] = country['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYArea', data=country):
                continue
            country['updateTime'] = self.crawl_timestamp
            self.area_count += 1
            self.db.insert(collection='DXYArea', data=country)

    def news_parser(self, news):
        if news is None:
            return
        news = json.loads(news.group(0))
        for _news in news:
            _news.pop('pubDateStr')
            if self.db.find_one(collection='DXYNews', data=_news):
                continue
            _news['crawlTime'] = self.crawl_timestamp
            self.news_count += 1
            self.db.insert(collection='DXYNews', data=_news)

    def rumor_parser(self, rumor_information):
        if rumor_information is None:
            return
        rumors = rumor_information['data']
        for _rumor in rumors:
            if self.db.find_one(collection='DXYRumor', data=_rumor):
                continue
            _rumor['crawlTime'] = self.crawl_timestamp
            self.rumor_count += 1
            self.db.insert(collection='DXYRumor', data=_rumor)
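
# The classes above assume a handful of module-level names (headers, logger,
# regex_parser, country_type, DB) that are defined elsewhere in the project.
# A minimal sketch of how the crawler might be configured and started, using
# the standard logging module in place of the project's own logger setup; the
# user-agent string is an illustrative assumption.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('crawler')

headers = {
    # Assumed browser-like user agent; the real project may send different headers.
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}

if __name__ == '__main__':
    Crawler().run()  # blocks forever, crawling once per sleep interval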
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.db = DB()
        self.crawl_timestamp = int()

    def run(self):
        while True:
            self.crawler()
            time.sleep(60)

    def crawler(self):
        while True:
            self.session.headers.update(
                {'user-agent': random.choice(user_agent_list)})
            self.crawl_timestamp = int(time.time() * 1000)
            try:
                r = self.session.get(url='https://ncov.dxy.cn/ncovh5/view/pneumonia')
            except requests.exceptions.ChunkedEncodingError:
                continue
            # soup = BeautifulSoup(r.content, 'lxml')
            soup = BeautifulSoup(r.content, 'html.parser')
            overall_information = re.search(
                r'(\{"id".*\})\}',
                str(soup.find('script', attrs={'id': 'getStatisticsService'})))
            if overall_information:
                self.overall_parser(overall_information=overall_information)
            area_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getAreaStat'})))
            if area_information:
                self.area_parser(area_information=area_information)
            abroad_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getListByCountryTypeService2true'})))
            if abroad_information:
                self.abroad_parser(abroad_information=abroad_information)
            news_chinese = re.search(
                r'\[(.*?)\]',
                str(soup.find('script', attrs={'id': 'getTimelineService1'})))
            if news_chinese:
                self.news_parser(news=news_chinese)
            news_english = re.search(
                r'\[(.*?)\]',
                str(soup.find('script', attrs={'id': 'getTimelineService2'})))
            if news_english:
                self.news_parser(news=news_english)
            rumors = re.search(
                r'\[(.*?)\]',
                str(soup.find('script', attrs={'id': 'getIndexRumorList'})))
            if rumors:
                self.rumor_parser(rumors=rumors)
            if not overall_information or \
                    not area_information or \
                    not abroad_information or \
                    not news_chinese or \
                    not news_english or \
                    not rumors:
                time.sleep(3)
                continue
            break
        logger.info('Successfully crawled.')

    def overall_parser(self, overall_information):
        overall_information = json.loads(overall_information.group(1))
        overall_information.pop('id')
        overall_information.pop('createTime')
        overall_information.pop('modifyTime')
        overall_information.pop('imgUrl')
        overall_information.pop('deleted')
        overall_information['countRemark'] = overall_information[
            'countRemark'].replace(' 疑似', ',疑似').replace(' 治愈', ',治愈').replace(
                ' 死亡', ',死亡').replace(' ', '')
        if not self.db.find_one(collection='DXYOverall', data=overall_information):
            overall_information['updateTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYOverall', data=overall_information)

    def province_parser(self, province_information):
        provinces = json.loads(province_information.group(0))
        for province in provinces:
            province.pop('id')
            province.pop('tags')
            province.pop('sort')
            province['comment'] = province['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYProvince', data=province):
                continue
            province['provinceEnglishName'] = city_name_map[
                province['provinceShortName']]['engName']
            province['crawlTime'] = self.crawl_timestamp
            province['country'] = country_type_map.get(province['countryType'])
            self.db.insert(collection='DXYProvince', data=province)

    def area_parser(self, area_information):
        area_information = json.loads(area_information.group(0))
        for area in area_information:
            area['comment'] = area['comment'].replace(' ', '')
            # Because the cities are given extra attributes below, the cities field
            # should not be used when checking for an identical document.
            cities_backup = area.pop('cities')
            if self.db.find_one(collection='DXYArea', data=area):
                continue
            # If this document is not yet in the database, put the attribute back
            # into the document before inserting it.
            area['cities'] = cities_backup
            area['countryName'] = '中国'
            area['countryEnglishName'] = 'China'
            area['continentName'] = '亚洲'
            area['continentEnglishName'] = 'Asia'
            area['provinceEnglishName'] = city_name_map[
                area['provinceShortName']]['engName']
            for city in area['cities']:
                if city['cityName'] != '待明确地区':
                    try:
                        city['cityEnglishName'] = city_name_map[
                            area['provinceShortName']]['cities'][city['cityName']]
                    except KeyError:
                        print(area['provinceShortName'], city['cityName'])
                        pass
                else:
                    city['cityEnglishName'] = 'Area not defined'
            area['updateTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYArea', data=area)

    def abroad_parser(self, abroad_information):
        countries = json.loads(abroad_information.group(0))
        for country in countries:
            try:
                country.pop('id')
                country.pop('tags')
                country.pop('sort')
                # Ding Xiang Yuan has a large number of duplicates whose values are
                # identical but whose modifyTime differs. modifyTime appears to be a
                # modification time shared across documents rather than specific to
                # this one, so the field is popped out.
                country.pop('modifyTime')
                # createTime also differs even when the values are the same.
                # Originally, createTime represented the first diagnosis of the virus
                # in an area, but it seems to mean something different for abroad data.
                country.pop('createTime')
                country['comment'] = country['comment'].replace(' ', '')
            except KeyError:
                pass
            country.pop('countryType')
            country.pop('provinceId')
            country.pop('cityName')
            # The original provinceShortName is a blank string.
            country.pop('provinceShortName')
            # Rename the key continents to continentName.
            country['continentName'] = country.pop('continents')
            if self.db.find_one(collection='DXYArea', data=country):
                continue
            country['countryName'] = country.get('provinceName')
            country['provinceShortName'] = country.get('provinceName')
            country['continentEnglishName'] = continent_name_map.get(
                country['continentName'])
            country['countryEnglishName'] = country_name_map.get(
                country['countryName'])
            country['provinceEnglishName'] = country_name_map.get(
                country['countryName'])
            country['updateTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYArea', data=country)

    def news_parser(self, news):
        news = json.loads(news.group(0))
        for _news in news:
            _news.pop('pubDateStr')
            if self.db.find_one(collection='DXYNews', data=_news):
                continue
            _news['crawlTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYNews', data=_news)

    def rumor_parser(self, rumors):
        rumors = json.loads(rumors.group(0))
        for rumor in rumors:
            rumor.pop('score')
            rumor['body'] = rumor['body'].replace(' ', '')
            if self.db.find_one(collection='DXYRumors', data=rumor):
                continue
            rumor['crawlTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYRumors', data=rumor)
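
# The parsers above pull JSON straight out of inline <script> tags in the DXY
# page. A small self-contained demonstration of that extraction step with a
# fabricated payload; the sample HTML below is an assumption about the page
# shape, not a captured response.
import json
import re

from bs4 import BeautifulSoup

sample_html = (
    '<script id="getAreaStat">'
    'try { window.getAreaStat = [{"provinceName": "湖北省", "confirmedCount": 1}]'
    '}catch(e){}</script>'
)
soup = BeautifulSoup(sample_html, 'html.parser')
match = re.search(r'\[(.*)\]', str(soup.find('script', attrs={'id': 'getAreaStat'})))
if match:
    areas = json.loads(match.group(0))  # group(0) keeps the surrounding brackets
    print(areas[0]['provinceName'])     # -> 湖北省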
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.session.headers.update(headers)
        self.db = DB()
        self.crawl_timestamp = int()

    def run(self):
        while True:
            self.crawler()
            time.sleep(1800)

    def crawler(self):
        while True:
            self.crawl_timestamp = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            try:
                r = self.session.get(url='https://3g.dxy.cn/newh5/view/pneumonia')
            except requests.exceptions.ChunkedEncodingError:
                continue
            soup = BeautifulSoup(r.content, 'lxml')
            # overall_information = re.search(r'\{("id".*?)\]\}', str(soup.find('script', attrs={'id': 'getStatisticsService'})))
            # province_information = re.search(r'\[(.*?)\]', str(soup.find('script', attrs={'id': 'getListByCountryTypeService1'})))
            area_information = re.search(r'\[(.*)\]', str(soup.find('script', attrs={'id': 'getAreaStat'})))
            # abroad_information = re.search(r'\[(.*)\]', str(soup.find('script', attrs={'id': 'getListByCountryTypeService2'})))
            # news = re.search(r'\[(.*?)\]', str(soup.find('script', attrs={'id': 'getTimelineService'})))
            if not area_information:
                continue
            # self.overall_parser(overall_information=overall_information)
            # self.province_parser(province_information=province_information)
            self.area_parser(area_information=area_information)
            # if abroad_information is not None:
            #     self.abroad_parser(abroad_information=abroad_information)
            # self.news_parser(news=news)
            break

        while True:
            self.crawl_timestamp = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            try:
                r = self.session.get(url='https://file1.dxycdn.com/2020/0127/797/3393185293879908067-115.json')
            except requests.exceptions.ChunkedEncodingError:
                continue
            # Use try-except so the .json() call cannot raise an unhandled exception.
            try:
                if r.status_code != 200:
                    continue
                elif r.json().get('code') == 'success':
                    self.rumor_parser(rumors=r.json().get('data'))
                    break
                else:
                    continue
            except json.decoder.JSONDecodeError:
                continue
        logger.info('Successfully crawled.')

    def overall_parser(self, overall_information):
        overall_information = json.loads(overall_information.group(0))
        overall_information.pop('id')
        overall_information.pop('createTime')
        overall_information.pop('modifyTime')
        overall_information.pop('imgUrl')
        overall_information.pop('deleted')
        overall_information['countRemark'] = overall_information[
            'countRemark'].replace(' 疑似', ',疑似').replace(' 治愈', ',治愈').replace(
                ' 死亡', ',死亡').replace(' ', '')
        if not self.db.find_one(collection='DXYOverall', data=overall_information):
            overall_information['updateTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYOverall', data=overall_information)

    def province_parser(self, province_information):
        provinces = json.loads(province_information.group(0))
        for province in provinces:
            province.pop('id')
            province.pop('tags')
            province.pop('sort')
            province['comment'] = province['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYProvince', data=province):
                continue
            province['crawlTime'] = self.crawl_timestamp
            province['country'] = country_type.get(province['countryType'])
            self.db.insert(collection='DXYProvince', data=province)

    def area_parser(self, area_information):
        area_information = json.loads(area_information.group(0))
        for area in area_information:
            area['comment'] = area['comment'].replace(' ', '')
            # Skip this area if an identical document already exists (no change in the data).
            if self.db.find_one(collection='DXYArea', data=area):
                continue
            # If the database already holds a document for this province, update it in place.
            area_in_mongo = self.db.find_one(
                collection='DXYArea',
                data={'provinceShortName': area['provinceShortName']})
            if area_in_mongo is not None:
                self.db.update_one(
                    collection='DXYArea',
                    query={'_id': area_in_mongo['_id']},
                    data_after={'$set': {
                        'confirmedCount': area['confirmedCount'],
                        'suspectedCount': area['suspectedCount'],
                        'currentConfirmedCount': area['currentConfirmedCount'],
                        'curedCount': area['curedCount'],
                        'deadCount': area['deadCount'],
                        'cities': area['cities'],
                        'updateTime': self.crawl_timestamp}})
                logger.info(area['provinceShortName'] + ' updated')
                continue
            # Otherwise insert a new document.
            area['country'] = '中国'
            area['updateTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYArea', data=area)
            logger.info(area['provinceShortName'] + ' inserted')

    def abroad_parser(self, abroad_information):
        countries = json.loads(abroad_information.group(0))
        for country in countries:
            country.pop('id')
            country.pop('tags')
            country.pop('countryType')
            country.pop('provinceId')
            country['country'] = country.get('provinceName')
            country['provinceShortName'] = country.get('provinceName')
            country.pop('cityName')
            country.pop('sort')
            country['comment'] = country['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYArea', data=country):
                continue
            country['updateTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYArea', data=country)

    def news_parser(self, news):
        news = json.loads(news.group(0))
        for _news in news:
            _news.pop('pubDateStr')
            if self.db.find_one(collection='DXYNews', data=_news):
                continue
            _news['crawlTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYNews', data=_news)

    def rumor_parser(self, rumors):
        for rumor in rumors:
            rumor.pop('score')
            rumor['body'] = rumor['body'].replace(' ', '')
            if self.db.find_one(collection='DXYRumors', data=rumor):
                continue
            rumor['crawlTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYRumors', data=rumor)
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.db = DB()
        self.crawl_timestamp = int()

    def run(self):
        # The crawler is triggered from Java, so it does not need to run on a
        # schedule here.
        # while True:
        self.crawler()
        # time.sleep(60 * 30)  # 30-minute interval

    def crawler(self):
        while True:
            self.session.headers.update(
                {'user-agent': random.choice(user_agent_list)})
            self.crawl_timestamp = int(time.time() * 1000)
            try:
                r = self.session.get(url='https://ncov.dxy.cn/ncovh5/view/pneumonia')
            except requests.exceptions.ChunkedEncodingError:
                continue
            soup = BeautifulSoup(r.content, 'lxml')
            area_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getAreaStat'})))
            if area_information:
                self.area_parser(area_information=area_information)
            if not area_information:
                time.sleep(3)
                continue
            break
        logger.info('Successfully crawled.')

    def area_parser(self, area_information):
        area_information = json.loads(area_information.group(0))
        self.db.connect_db()
        self.db.create_db_table()
        for area in area_information:
            # Holds one record at a time.
            rowdata = {}
            cities_backup = area.pop('cities')
            area['cities'] = cities_backup
            area['countryName'] = '中国'
            area['countryEnglishName'] = 'China'
            area['continentName'] = '亚洲'
            area['continentEnglishName'] = 'Asia'
            area['provinceEnglishName'] = city_name_map[area['provinceShortName']]['engName']
            area['updateTime'] = self.crawl_timestamp
            # Province-level fields shared by every row.
            rowdata['updateTime'] = area['updateTime']
            rowdata['provinceName'] = area['provinceName']
            # If cities is not empty, parse every city in it.
            if cities_backup:
                for city in area['cities']:
                    if city['cityName'] != '待明确地区':
                        try:
                            city['cityEnglishName'] = city_name_map[
                                area['provinceShortName']]['cities'][city['cityName']]
                            rowdata['cityName'] = city['cityName']
                            rowdata['currentConfirmedCount'] = city['currentConfirmedCount']
                            rowdata['confirmedCount'] = city['confirmedCount']
                            rowdata['suspectedCount'] = city['suspectedCount']
                            rowdata['curedCount'] = city['curedCount']
                            rowdata['deadCount'] = city['deadCount']
                            rowdata['locationId'] = city['locationId']
                            # Insert the city-level record into the database.
                            self.db.insert(rowdata)
                        except KeyError:
                            print(area['provinceShortName'], city['cityName'])
                            pass
                    else:
                        city['cityEnglishName'] = 'Area not defined'
            else:
                # No city breakdown: store the province-level counts as a single row.
                try:
                    rowdata['cityName'] = area['provinceName']
                    rowdata['currentConfirmedCount'] = area['currentConfirmedCount']
                    rowdata['confirmedCount'] = area['confirmedCount']
                    rowdata['suspectedCount'] = area['suspectedCount']
                    rowdata['curedCount'] = area['curedCount']
                    rowdata['deadCount'] = area['deadCount']
                    rowdata['locationId'] = area['locationId']
                    self.db.insert(rowdata)
                except KeyError:
                    print("No such key")
                    pass
        self.db.close_db_connection()
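
# This variant writes flat rows rather than documents, so its DB helper exposes
# connect_db / create_db_table / insert / close_db_connection instead of the
# Mongo-style interface used elsewhere. Below is a rough sqlite3 sketch of that
# interface; the class name, file name, table name and column set are
# assumptions inferred from the rowdata dict above, and the real project may
# target a different database and schema.
import sqlite3


class SQLiteDBSketch:
    def __init__(self, path='dxy_area.sqlite3'):  # assumed file name
        self.path = path
        self.conn = None

    def connect_db(self):
        self.conn = sqlite3.connect(self.path)

    def create_db_table(self):
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS area ('
            'updateTime INTEGER, provinceName TEXT, cityName TEXT, '
            'currentConfirmedCount INTEGER, confirmedCount INTEGER, '
            'suspectedCount INTEGER, curedCount INTEGER, deadCount INTEGER, '
            'locationId INTEGER)')

    def insert(self, rowdata):
        # Build the column and placeholder lists from whatever keys are present.
        columns = ', '.join(rowdata.keys())
        placeholders = ', '.join('?' for _ in rowdata)
        self.conn.execute(
            'INSERT INTO area ({}) VALUES ({})'.format(columns, placeholders),
            list(rowdata.values()))
        self.conn.commit()

    def close_db_connection(self):
        self.conn.close()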
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.session.headers.update({"user-agent": ua.random})
        self.db = DB()
        self.crawl_timestamp = int()

    def run(self):
        while True:
            self.tencent_crawler()
            self.dxy_crawler()
            # self.location_crawler()
            logger.info('All data crawled; sleeping for 1 hour.')
            time.sleep(3600)
            logger.info('Sleep finished.')

    def history_data_crawler(self):
        while True:
            try:
                overall = self.session.get(
                    url="https://lab.isaaclin.cn/nCoV/api/overall?latest=0")
                area = self.session.get(
                    url="https://lab.isaaclin.cn/nCoV/api/area")
            except requests.exceptions.ChunkedEncodingError:
                self.session.headers.update({"user-agent": ua.random})
                continue
            history_overall = json.loads(overall.text)['results']
            history_area = json.loads(area.text)['results']
            for i in history_overall:
                self.overall_parser(overall_information=i, keep_cursor=True)
            for i in history_area:
                self.history_area_parser(area=i, keep_cursor=True)
            break

    def dxy_crawler(self):
        # Nationwide epidemic data.
        while True:
            logger.info('Start crawling DXY data.')
            self.crawl_timestamp = int(
                datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            try:
                r = self.session.get(url='https://3g.dxy.cn/newh5/view/pneumonia')
            except requests.exceptions.ChunkedEncodingError:
                self.session.headers.update({"user-agent": ua.random})
                continue
            soup = BeautifulSoup(r.content, 'lxml')
            overall_information = re.search(
                r'(\{"id".*\}\})\}',
                str(soup.find('script', attrs={'id': 'getStatisticsService'})))
            area_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getAreaStat'})))
            abroad_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getListByCountryTypeService2'})))
            if not overall_information or not area_information:
                continue
            logger.info('DXY data fetched successfully; writing to database.')
            self.overall_parser(
                overall_information=json.loads(overall_information.group(1)))
            logger.info('DXY overall data written.')
            self.area_parser(
                area_information=json.loads(area_information.group(0)))
            self.abroad_parser(
                abroad_information=json.loads(abroad_information.group(0)))
            logger.info('DXY area data written.')
            logger.info('DXY data crawl finished.')
            break

    def location_crawler(self):
        # Detailed location-level data.
        while True:
            logger.info('Start crawling detailed location data.')
            locations = []
            try:
                fail_count, count = 0, 0
                while fail_count < 5:
                    location = self.session.get(
                        url="https://assets.cbndata.org/2019-nCoV/{}/data.json".format(count))
                    if location.ok is True:
                        count += 1
                        locations.append(location)
                    else:
                        fail_count += 1
            except requests.exceptions.ChunkedEncodingError:
                logger.info('Failed to fetch detailed location data; retrying.')
                self.session.headers.update({"user-agent": ua.random})
                continue
            logger.info('Detailed location data fetched successfully; writing to database.')
            for location in locations:
                location = json.loads(location.text)['data']
                for i in location:
                    self.location_parser(i, keep_cursor=True)
            self.db.close_cursor()
            logger.info('Detailed location data crawl finished.')
            break

    def tencent_crawler(self):
        # Daily new-case counts.
        while True:
            logger.info('Start crawling Tencent data.')
            try:
                daily = self.session.get(
                    url="https://view.inews.qq.com/g2/getOnsInfo?name=disease_other")
            except requests.exceptions.ChunkedEncodingError:
                self.session.headers.update({"user-agent": ua.random})
                logger.info('Failed to fetch Tencent data; retrying.')
                continue
            logger.info('Tencent data fetched successfully; writing to database.')
            daily_json = json.loads(daily.text)['data']
            daily_dict = json.loads(daily_json)
            day_add_list = daily_dict['chinaDayAddList']
            day_list = daily_dict['chinaDayList']
            for daily in day_add_list:
                self.day_add_list_parser(daily, keep_cursor=True)
            for daily in day_list:
                self.day_list_parser(daily, keep_cursor=True)
            self.db.close_cursor()
            logger.info('Tencent data crawl finished.')
            break

    def overall_parser(self, overall_information, keep_cursor=False):
        self.db.open_cursor()
        overall_information['countRemark'] = overall_information[
            'countRemark'].replace(' 疑似', ',疑似').replace(' 治愈', ',治愈').replace(
                ' 死亡', ',死亡').replace(' ', '')
        data = dict()
        data['countRemark'] = overall_information['countRemark']
        data['virus'] = self.change_remark('virus', overall_information)
        data['infectSource'] = self.change_remark('infectSource', overall_information)
        data['passWay'] = self.change_remark('passWay', overall_information)
        data['remark1'] = overall_information['remark1']
        data['remark2'] = overall_information['remark2']
        data['remark3'] = overall_information['remark3']
        data['remark4'] = overall_information['remark4']
        data['remark5'] = overall_information['remark5']
        data['confirmedCount'] = overall_information['confirmedCount']
        data['suspectedCount'] = overall_information['suspectedCount']
        data['curedCount'] = overall_information['curedCount']
        data['deadCount'] = overall_information['deadCount']
        if 'updateTime' in overall_information:
            data['updateTime'] = overall_information['updateTime']
        else:
            data['updateTime'] = self.crawl_timestamp
        is_repeat = self.db.is_repeat(collection='DXYOverall', data=data)
        if not is_repeat:
            self.db.insert(collection='DXYOverall', data=data)
        self.db.close_cursor(keep_cursor)

    @staticmethod
    def change_remark(key, data):
        # If the value refers to a note ('说明'), replace it with the matching
        # noteN field when that field is present.
        res = data[key]
        if '说明' in data[key]:
            operated = 'note{}'.format(data[key][-1])
            if operated in data:
                res = data[operated]
        return res

    def province_parser(self, province_information):
        provinces = json.loads(province_information.group(0))
        for province in provinces:
            province.pop('id')
            province.pop('tags')
            province.pop('sort')
            province['comment'] = province['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYProvince', data=province):
                continue
            province['crawlTime'] = self.crawl_timestamp
            province['country'] = country_type.get(province['countryType'])
            self.db.insert(collection='DXYProvince', data=province)

    def area_parser(self, area_information, keep_cursor=False):
        self.db.open_cursor()
        for area in area_information:
            area['comment'] = area['comment'].replace(' ', '')
            area['country'] = '中国'
            area['continents'] = '亚洲'
            if 'updateTime' not in area:
                area['updateTime'] = self.crawl_timestamp
            area.pop('locationId')
            area['cities'] = json.dumps(area['cities'])
            is_repeat = self.db.is_repeat(collection='DXYArea', data=area)
            if not is_repeat:
                self.db.insert(collection='DXYArea', data=area)
        self.db.close_cursor(keep_cursor)

    def abroad_parser(self, abroad_information, keep_cursor=False):
        countries = abroad_information
        self.db.open_cursor()
        for country in countries:
            country['country'] = country.get('provinceName')
            country['provinceShortName'] = country.get('provinceName')
            country['comment'] = country['comment'].replace(' ', '')
            data = dict()
            data['continents'] = country['continents']
            data['country'] = country['country']
            data['provinceName'] = country['provinceName']
            data['provinceShortName'] = country['provinceShortName']
            data['confirmedCount'] = country['confirmedCount']
            data['suspectedCount'] = country['suspectedCount']
            data['curedCount'] = country['curedCount']
            data['deadCount'] = country['deadCount']
            if 'cities' in country:
                data['cities'] = country['cities']
            else:
                data['cities'] = '[]'
            data['comment'] = country['comment']
            if 'updateTime' not in data:
                data['updateTime'] = self.crawl_timestamp
            is_repeat = self.db.is_repeat(collection='DXYArea', data=data)
            if not is_repeat:
                self.db.insert(collection='DXYArea', data=data)
        self.db.close_cursor(keep_cursor)

    def history_area_parser(self, area, keep_cursor=False):
        self.db.open_cursor()
        if area['country'] == '中国':
            area['continents'] = '亚洲'
        else:
            area['continents'] = 'temp'
        if 'cities' in area:
            area['cities'] = json.dumps(area['cities'])
        else:
            area['cities'] = '[]'
        is_repeat = self.db.is_repeat(collection='DXYArea', data=area)
        if not is_repeat:
            self.db.insert(collection='DXYArea', data=area)
        self.db.close_cursor(keep_cursor)

    def location_parser(self, location, keep_cursor=False):
        self.db.open_cursor()
        if 'longitude' not in location:
            return False
        is_repeat = self.db.is_repeat(collection='location', data=location)
        if not is_repeat:
            self.db.insert(collection='location', data=location)
        else:
            try:
                self.db.update(collection='location', data=location)
            except Exception:
                pass
        self.db.close_cursor(keep_cursor)

    def day_add_list_parser(self, daily, keep_cursor=False):
        self.db.open_cursor()
        is_repeat = self.db.is_repeat(collection='day_add_list', data=daily)
        if not is_repeat:
            self.db.insert(collection='day_add_list', data=daily)
        self.db.close_cursor(keep_cursor)

    def day_list_parser(self, day_add_list, keep_cursor=False):
        self.db.open_cursor()
        is_repeat = self.db.is_repeat(collection='day_list', data=day_add_list)
        if not is_repeat:
            self.db.insert(collection='day_list', data=day_add_list)
        self.db.close_cursor(keep_cursor)
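
# The Tencent endpoint used by tencent_crawler returns JSON whose "data" field
# is itself a JSON-encoded string, which is why the method calls json.loads
# twice. A tiny self-contained illustration with a fabricated payload; the
# field values below are dummy numbers, not real figures.
import json

fake_response_text = json.dumps({
    "ret": 0,
    "data": json.dumps({
        "chinaDayAddList": [{"date": "01.28", "confirm": 1}],
        "chinaDayList": [{"date": "01.28", "confirm": 2}],
    }),
})
outer = json.loads(fake_response_text)['data']  # still a string at this point
inner = json.loads(outer)                       # now a dict with the day lists
print(inner['chinaDayAddList'][0]['date'])      # -> 01.28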