Example #1
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.session.headers.update(headers)
        self.db = DB()
        self.crawl_timestamp = int()
        self.url = "https://3g.dxy.cn/newh5/view/pneumonia"

    def run(self):
        while True:
            self.crawler()
            time.sleep(60)

    def crawler(self):
        while True:
            self.crawl_timestamp = int(
                datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            r = self.session.get(url=self.url)
            soup = BeautifulSoup(r.content, 'lxml')
            overall_information = re.search(
                r'\{("id".*?)\}',
                str(soup.find('script', attrs={'id': 'getStatisticsService'})))
            province_information = re.search(
                r'\[(.*?)\]',
                str(
                    soup.find('script',
                              attrs={'id': 'getListByCountryTypeService1'})))
            area_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getAreaStat'})))
            abroad_information = re.search(
                r'\[(.*)\]',
                str(
                    soup.find('script',
                              attrs={'id': 'getListByCountryTypeService2'})))
            news = re.search(
                r'\[(.*?)\]',
                str(soup.find('script', attrs={'id': 'getTimelineService'})))

            if not overall_information or not province_information or not area_information or not news:
                continue

            self.overall_parser(overall_information=overall_information)
            self.province_parser(province_information=province_information)
            self.area_parser(area_information=area_information)
            self.abroad_parser(abroad_information=abroad_information)
            self.news_parser(news=news)

            break

        logger.info('Successfully crawled.')

    def overall_parser(self, overall_information):
        overall_information = json.loads(overall_information.group(0))
        overall_information.pop('id')
        overall_information.pop('createTime')
        overall_information.pop('modifyTime')
        overall_information.pop('imgUrl')
        overall_information.pop('deleted')
        overall_information['countRemark'] = overall_information[
            'countRemark'].replace(' 疑似', ',疑似').replace(' 治愈', ',治愈').replace(
                ' 死亡', ',死亡').replace(' ', '')
        if not self.db.find_one(collection='DXYOverall',
                                data=overall_information):
            overall_information['updateTime'] = self.crawl_timestamp
            overall_information = regex_parser(content=overall_information,
                                               key='countRemark')

            self.db.insert(collection='DXYOverall', data=overall_information)

    def province_parser(self, province_information):
        provinces = json.loads(province_information.group(0))
        for province in provinces:
            province.pop('id')
            province.pop('tags')
            province.pop('sort')
            province['comment'] = province['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYProvince', data=province):
                continue
            province['crawlTime'] = self.crawl_timestamp
            province['country'] = country_type.get(province['countryType'])

            self.db.insert(collection='DXYProvince', data=province)

    def area_parser(self, area_information):
        area_information = json.loads(area_information.group(0))
        for area in area_information:
            area['comment'] = area['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYArea', data=area):
                continue
            area['country'] = '中国'
            area['updateTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYArea', data=area)

    def abroad_parser(self, abroad_information):
        countries = json.loads(abroad_information.group(0))
        for country in countries:
            country.pop('id')
            country.pop('tags')
            country.pop('countryType')
            country.pop('provinceId')
            country['country'] = country.get('provinceName')
            country.pop('provinceShortName')
            country.pop('cityName')
            country.pop('sort')

            country['comment'] = country['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYArea', data=country):
                continue
            country['updateTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYArea', data=country)

    def news_parser(self, news):
        news = json.loads(news.group(0))
        for _news in news:
            _news.pop('pubDateStr')
            if self.db.find_one(collection='DXYNews', data=_news):
                continue
            _news['crawlTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYNews', data=_news)
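
Example #1 (and the variants below) relies on module-level names that are defined elsewhere in each project: headers, logger, DB, country_type, regex_parser, and various name maps. A minimal sketch of what the simpler pieces might look like; the concrete values are illustrative assumptions, not the original configuration:

import logging

# Request headers for the DXY page; the exact user-agent string is an assumption.
headers = {
    'user-agent': 'Mozilla/5.0 (compatible; nCoV-crawler)',
}

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('crawler')

# Mapping from DXY's countryType codes to country labels (assumed values).
country_type = {
    1: '中国',
    2: '国外',
}
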
Example #2
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.session.headers.update(headers)
        self.db = DB()
        self.crawl_timestamp = int()

    def run(self):
        while True:
            self.crawler()
            time.sleep(300)

    def crawler(self):
        while True:
            self.crawl_timestamp = int(
                datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            try:
                r = self.session.get(
                    url='https://3g.dxy.cn/newh5/view/pneumonia')
            except requests.exceptions.ChunkedEncodingError:
                continue
            except requests.exceptions.ConnectionError:
                logger.warn("Server Disconnected.")
                break
            soup = BeautifulSoup(r.content, 'lxml')

            overall_information = re.search(
                r'\{("id".*?)\]\}',
                str(soup.find('script', attrs={'id': 'getStatisticsService'})))
            province_information = re.search(
                r'\[(.*?)\]',
                str(
                    soup.find('script',
                              attrs={'id': 'getListByCountryTypeService1'})))
            area_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getAreaStat'})))
            abroad_information = re.search(
                r'\[(.*)\]',
                str(
                    soup.find('script',
                              attrs={'id': 'getListByCountryTypeService2'})))
            news = re.search(
                r'\[(.*?)\]',
                str(soup.find('script', attrs={'id': 'getTimelineService'})))

            if not overall_information or not province_information or not area_information or not news:
                continue

            self.overall_parser(overall_information=overall_information)
            self.province_parser(province_information=province_information)
            self.area_parser(area_information=area_information)
            self.abroad_parser(abroad_information=abroad_information)
            self.news_parser(news=news)

            break

        while True:
            self.crawl_timestamp = int(
                datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            try:
                r = self.session.get(
                    url=
                    'https://file1.dxycdn.com/2020/0127/797/3393185293879908067-115.json'
                )
            except requests.exceptions.ChunkedEncodingError:
                continue
            # Use try-except to ensure the .json() call does not raise an unhandled exception.
            try:
                if r.status_code != 200:
                    continue
                elif r.json().get('code') == 'success':
                    self.rumor_parser(rumors=r.json().get('data'))
                    break
                else:
                    continue
            except json.decoder.JSONDecodeError:
                continue

        logger.info('Successfully crawled.')

    def overall_parser(self, overall_information):
        overall_information = json.loads(overall_information.group(0))
        overall_information.pop('id')
        overall_information.pop('createTime')
        overall_information.pop('modifyTime')
        overall_information.pop('imgUrl')
        overall_information.pop('deleted')
        overall_information['countRemark'] = overall_information[
            'countRemark'].replace(' 疑似', ',疑似').replace(' 治愈', ',治愈').replace(
                ' 死亡', ',死亡').replace(' ', '')

        if not self.db.find_one(collection='DXYOverall',
                                data=overall_information):
            overall_information['updateTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYOverall', data=overall_information)

    def province_parser(self, province_information):
        provinces = json.loads(province_information.group(0))
        for province in provinces:
            province.pop('id')
            province.pop('tags')
            province.pop('sort')
            province['comment'] = province['comment'].replace(' ', '')

            if self.db.find_one(collection='DXYProvince', data=province):
                continue

            province['provinceEnglishName'] = city_name_map[
                province['provinceShortName']]['engName']
            province['crawlTime'] = self.crawl_timestamp
            province['country'] = country_type_map.get(province['countryType'])

            self.db.insert(collection='DXYProvince', data=province)

    def area_parser(self, area_information):
        area_information = json.loads(area_information.group(0))
        for area in area_information:
            area['comment'] = area['comment'].replace(' ', '')

            # Because extra attributes are added to the cities later,
            # this field should be excluded when checking for an identical document.
            cities_backup = area.pop('cities')

            if self.db.find_one(collection='DXYArea', data=area):
                continue

            # If this document is not yet in the database, restore the cities field before inserting.
            area['cities'] = cities_backup

            area['countryName'] = '中国'
            area['countryEnglishName'] = 'China'
            area['continentName'] = '亚洲'
            area['continentEnglishName'] = 'Asia'
            area['provinceEnglishName'] = city_name_map[
                area['provinceShortName']]['engName']

            for city in area['cities']:
                if city['cityName'] != '待明确地区':
                    try:
                        city['cityEnglishName'] = city_name_map[area[
                            'provinceShortName']]['cities'][city['cityName']]
                    except KeyError:
                        print(area['provinceShortName'], city['cityName'])
                        pass
                else:
                    city['cityEnglishName'] = 'Area not defined'

            area['updateTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYArea', data=area)

    def abroad_parser(self, abroad_information):
        countries = json.loads(abroad_information.group(0))
        for country in countries:
            country.pop('id')
            country.pop('tags')
            country.pop('countryType')
            country.pop('provinceId')
            country.pop('cityName')
            country.pop('sort')
            # The original provinceShortName is a blank string
            country.pop('provinceShortName')
            # Rename the key 'continents' to 'continentName'
            country['continentName'] = country.pop('continents')
            # Ding Xiang Yuan has a large number of duplicates:
            # the values are all the same, but the modifyTime differs.
            # The modifyTime appears to be the modification time for all documents,
            # rather than for only this document, so this field is popped out.
            country.pop('modifyTime')
            # createTime also differs even when the values are the same.
            # Originally, createTime represented the first diagnosis of the virus in this area,
            # but it seems to behave differently for abroad information.
            country.pop('createTime')

            country['comment'] = country['comment'].replace(' ', '')

            if self.db.find_one(collection='DXYArea', data=country):
                continue

            country['countryName'] = country.get('provinceName')
            country['provinceShortName'] = country.get('provinceName')
            country['continentEnglishName'] = continent_name_map.get(
                country['continentName'])
            country['countryEnglishName'] = country_name_map.get(
                country['countryName'])
            country['provinceEnglishName'] = country_name_map.get(
                country['countryName'])

            country['updateTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYArea', data=country)

    def news_parser(self, news):
        news = json.loads(news.group(0))
        for _news in news:
            _news.pop('pubDateStr')
            if self.db.find_one(collection='DXYNews', data=_news):
                continue
            _news['crawlTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYNews', data=_news)

    def rumor_parser(self, rumors):
        for rumor in rumors:
            rumor.pop('score')
            rumor['body'] = rumor['body'].replace(' ', '')
            if self.db.find_one(collection='DXYRumors', data=rumor):
                continue
            rumor['crawlTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYRumors', data=rumor)
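
The parsers above pull their data out of <script> tags on the DXY page, where each payload is embedded as a JavaScript assignment. A small illustration of that extraction, run against a hypothetical HTML fragment (the real page contains far more data):

import json
import re

from bs4 import BeautifulSoup

html = ('<script id="getAreaStat">try { window.getAreaStat = '
        '[{"provinceName": "湖北省", "provinceShortName": "湖北", "confirmedCount": 1}]'
        '}catch(e){}</script>')

soup = BeautifulSoup(html, 'lxml')
match = re.search(r'\[(.*)\]',
                  str(soup.find('script', attrs={'id': 'getAreaStat'})))
if match:
    areas = json.loads(match.group(0))  # group(0) keeps the enclosing brackets
    print(areas[0]['provinceShortName'])  # -> 湖北
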
Example #3
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.session.headers.update(headers)
        self.db = DB()
        self.crawl_timestamp = int()
        self.url = "https://3g.dxy.cn/newh5/view/pneumonia"
        self.rumor_url = "https://file1.dxycdn.com/2020/0127/797/3393185293879908067-115.json"

        self.overall_count = 0
        self.province_count = 0
        self.area_count = 0
        self.news_count = 0
        self.rumor_count = 0

    def run(self):
        while True:
            self.crawl()
            time.sleep(60)

    def crawl(self):
        # reset counters
        self.overall_count = 0
        self.province_count = 0
        self.area_count = 0
        self.news_count = 0
        self.rumor_count = 0

        self.crawl_timestamp = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
        r = self.session.get(url=self.url)
        soup = BeautifulSoup(r.content, 'lxml')
        overall_information = re.search(r'\{("id".*?)\}',
                                        str(soup.find('script', attrs={'id': 'getStatisticsService'})))
        province_information = re.search(r'\[(.*?)\]',
                                         str(soup.find('script', attrs={'id': 'getListByCountryTypeService1'})))
        area_information = re.search(r'\[(.*)\]', str(soup.find('script', attrs={'id': 'getAreaStat'})))
        abroad_information = re.search(r'\[(.*)\]',
                                       str(soup.find('script', attrs={'id': 'getListByCountryTypeService2'})))
        news = re.search(r'\[(.*?)\]', str(soup.find('script', attrs={'id': 'getTimelineService'})))

        rumor_resp = self.session.get(url=self.rumor_url + '?t=' + str(self.crawl_timestamp))
        if rumor_resp.status_code == 200:
            rumor_information = rumor_resp.json()
        else:
            logger.warning("Failed get rumor json. status code: %s, reason: %s."
                           % (rumor_resp.status_code, rumor_resp.reason))
            rumor_information = None

        if overall_information or province_information or area_information or news or rumor_information:
            self.overall_parser(overall_information=overall_information)
            self.province_parser(province_information=province_information)
            self.area_parser(area_information=area_information)
            self.abroad_parser(abroad_information=abroad_information)
            self.news_parser(news=news)
            self.rumor_parser(rumor_information=rumor_information)

        logger.info('Successfully crawled. Added %d overall, %d province, %d area, %d news, %d rumor.' %
                    (self.overall_count, self.province_count, self.area_count, self.news_count, self.rumor_count))

    def overall_parser(self, overall_information):
        if overall_information is None:
            return
        overall_information = json.loads(overall_information.group(0))
        overall_information.pop('id')
        overall_information.pop('createTime')
        overall_information.pop('modifyTime')
        overall_information.pop('imgUrl')
        overall_information.pop('deleted')
        overall_information['countRemark'] = overall_information['countRemark'].replace(' 疑似', ',疑似').replace(' 治愈',
                                                                                                              ',治愈').replace(
            ' 死亡', ',死亡').replace(' ', '')
        if not self.db.find_one(collection='DXYOverall', data=overall_information):
            overall_information['updateTime'] = self.crawl_timestamp
            overall_information = regex_parser(content=overall_information, key='countRemark')

            self.overall_count += 1
            self.db.insert(collection='DXYOverall', data=overall_information)

    def province_parser(self, province_information):
        if province_information is None:
            return
        provinces = json.loads(province_information.group(0))
        for province in provinces:
            province.pop('id')
            province.pop('tags')
            province.pop('sort')
            province['comment'] = province['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYProvince', data=province):
                continue
            province['crawlTime'] = self.crawl_timestamp
            province['country'] = country_type.get(province['countryType'])

            self.province_count += 1
            self.db.insert(collection='DXYProvince', data=province)

    def area_parser(self, area_information):
        if area_information is None:
            return
        area_information = json.loads(area_information.group(0))
        for area in area_information:
            area['comment'] = area['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYArea', data=area):
                continue
            area['country'] = '中国'
            area['updateTime'] = self.crawl_timestamp

            self.area_count += 1
            self.db.insert(collection='DXYArea', data=area)

    def abroad_parser(self, abroad_information):
        if abroad_information is None:
            return
        countries = json.loads(abroad_information.group(0))
        for country in countries:
            country.pop('id')
            country.pop('tags')
            country.pop('countryType')
            country.pop('provinceId')
            country['country'] = country.get('provinceName')
            country['provinceShortName'] = country.get('provinceName')
            country.pop('cityName')
            country.pop('sort')

            country['comment'] = country['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYArea', data=country):
                continue
            country['updateTime'] = self.crawl_timestamp

            self.area_count += 1
            self.db.insert(collection='DXYArea', data=country)

    def news_parser(self, news):
        if news is None:
            return
        news = json.loads(news.group(0))
        for _news in news:
            _news.pop('pubDateStr')
            if self.db.find_one(collection='DXYNews', data=_news):
                continue
            _news['crawlTime'] = self.crawl_timestamp

            self.news_count += 1
            self.db.insert(collection='DXYNews', data=_news)

    def rumor_parser(self, rumor_information):
        if rumor_information is None:
            return
        rumor = rumor_information['data']
        for _rumor in rumor:
            if self.db.find_one(collection='DXYRumor', data=_rumor):
                continue

            _rumor['crawlTime'] = self.crawl_timestamp

            self.rumor_count += 1
            self.db.insert(collection='DXYRumor', data=_rumor)
Example #4
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.db = DB()
        self.crawl_timestamp = int()

    def run(self):
        while True:
            self.crawler()
            time.sleep(60)

    def crawler(self):
        while True:
            self.session.headers.update(
                {'user-agent': random.choice(user_agent_list)})
            self.crawl_timestamp = int(time.time() * 1000)
            try:
                r = self.session.get(
                    url='https://ncov.dxy.cn/ncovh5/view/pneumonia')
            except requests.exceptions.ChunkedEncodingError:
                continue
            # soup = BeautifulSoup(r.content, 'lxml')
            soup = BeautifulSoup(r.content, 'html.parser')

            overall_information = re.search(
                r'(\{"id".*\})\}',
                str(soup.find('script', attrs={'id': 'getStatisticsService'})))
            if overall_information:
                self.overall_parser(overall_information=overall_information)

            area_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getAreaStat'})))
            if area_information:
                self.area_parser(area_information=area_information)

            abroad_information = re.search(
                r'\[(.*)\]',
                str(
                    soup.find('script',
                              attrs={'id':
                                     'getListByCountryTypeService2true'})))
            if abroad_information:
                self.abroad_parser(abroad_information=abroad_information)

            news_chinese = re.search(
                r'\[(.*?)\]',
                str(soup.find('script', attrs={'id': 'getTimelineService1'})))
            if news_chinese:
                self.news_parser(news=news_chinese)

            news_english = re.search(
                r'\[(.*?)\]',
                str(soup.find('script', attrs={'id': 'getTimelineService2'})))
            if news_english:
                self.news_parser(news=news_english)

            rumors = re.search(
                r'\[(.*?)\]',
                str(soup.find('script', attrs={'id': 'getIndexRumorList'})))
            if rumors:
                self.rumor_parser(rumors=rumors)

            if not overall_information or \
                    not area_information or \
                    not abroad_information or \
                    not news_chinese or \
                    not news_english or \
                    not rumors:
                time.sleep(3)
                continue

            break

        logger.info('Successfully crawled.')

    def overall_parser(self, overall_information):
        overall_information = json.loads(overall_information.group(1))
        overall_information.pop('id')
        overall_information.pop('createTime')
        overall_information.pop('modifyTime')
        overall_information.pop('imgUrl')
        overall_information.pop('deleted')
        overall_information['countRemark'] = overall_information[
            'countRemark'].replace(' 疑似', ',疑似').replace(' 治愈', ',治愈').replace(
                ' 死亡', ',死亡').replace(' ', '')

        if not self.db.find_one(collection='DXYOverall',
                                data=overall_information):
            overall_information['updateTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYOverall', data=overall_information)

    def province_parser(self, province_information):
        provinces = json.loads(province_information.group(0))
        for province in provinces:
            province.pop('id')
            province.pop('tags')
            province.pop('sort')
            province['comment'] = province['comment'].replace(' ', '')

            if self.db.find_one(collection='DXYProvince', data=province):
                continue

            province['provinceEnglishName'] = city_name_map[
                province['provinceShortName']]['engName']
            province['crawlTime'] = self.crawl_timestamp
            province['country'] = country_type_map.get(province['countryType'])

            self.db.insert(collection='DXYProvince', data=province)

    def area_parser(self, area_information):
        area_information = json.loads(area_information.group(0))
        for area in area_information:
            area['comment'] = area['comment'].replace(' ', '')

            # Because extra attributes are added to the cities later,
            # this field should be excluded when checking for an identical document.
            cities_backup = area.pop('cities')

            if self.db.find_one(collection='DXYArea', data=area):
                continue

            # If this document is not yet in the database, restore the cities field before inserting.
            area['cities'] = cities_backup

            area['countryName'] = '中国'
            area['countryEnglishName'] = 'China'
            area['continentName'] = '亚洲'
            area['continentEnglishName'] = 'Asia'
            area['provinceEnglishName'] = city_name_map[
                area['provinceShortName']]['engName']

            for city in area['cities']:
                if city['cityName'] != '待明确地区':
                    try:
                        city['cityEnglishName'] = city_name_map[area[
                            'provinceShortName']]['cities'][city['cityName']]
                    except KeyError:
                        print(area['provinceShortName'], city['cityName'])
                        pass
                else:
                    city['cityEnglishName'] = 'Area not defined'

            area['updateTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYArea', data=area)

    def abroad_parser(self, abroad_information):
        countries = json.loads(abroad_information.group(0))
        for country in countries:
            try:
                country.pop('id')
                country.pop('tags')
                country.pop('sort')
                # Ding Xiang Yuan has a large number of duplicates:
                # the values are all the same, but the modifyTime differs.
                # The modifyTime appears to be the modification time for all documents,
                # rather than for only this document, so this field is popped out.
                country.pop('modifyTime')
                # createTime also differs even when the values are the same.
                # Originally, createTime represented the first diagnosis of the virus in this area,
                # but it seems to behave differently for abroad information.
                country.pop('createTime')
                country['comment'] = country['comment'].replace(' ', '')
            except KeyError:
                pass
            country.pop('countryType')
            country.pop('provinceId')
            country.pop('cityName')
            # The original provinceShortName is a blank string
            country.pop('provinceShortName')
            # Rename the key 'continents' to 'continentName'
            country['continentName'] = country.pop('continents')

            if self.db.find_one(collection='DXYArea', data=country):
                continue

            country['countryName'] = country.get('provinceName')
            country['provinceShortName'] = country.get('provinceName')
            country['continentEnglishName'] = continent_name_map.get(
                country['continentName'])
            country['countryEnglishName'] = country_name_map.get(
                country['countryName'])
            country['provinceEnglishName'] = country_name_map.get(
                country['countryName'])

            country['updateTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYArea', data=country)

    def news_parser(self, news):
        news = json.loads(news.group(0))
        for _news in news:
            _news.pop('pubDateStr')
            if self.db.find_one(collection='DXYNews', data=_news):
                continue
            _news['crawlTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYNews', data=_news)

    def rumor_parser(self, rumors):
        rumors = json.loads(rumors.group(0))
        for rumor in rumors:
            rumor.pop('score')
            rumor['body'] = rumor['body'].replace(' ', '')
            if self.db.find_one(collection='DXYRumors', data=rumor):
                continue
            rumor['crawlTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYRumors', data=rumor)
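
Example #4 loads the overall block from group(1) instead of group(0): the trailing '}' in the pattern r'(\{"id".*\})\}' belongs to the surrounding JavaScript wrapper, not to the JSON payload. A short illustration against a hypothetical script string:

import json
import re

script_text = ('<script id="getStatisticsService">try { '
               'window.getStatisticsService = '
               '{"id": 1, "confirmedCount": 2, "modifyTime": 1580000000000}'
               '}catch(e){}</script>')

match = re.search(r'(\{"id".*\})\}', script_text)
print(match.group(0))  # ends with '}}'; the extra brace would break json.loads
overall = json.loads(match.group(1))  # group(1) is the JSON object alone
print(overall['confirmedCount'])  # -> 2
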
Example #5
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.session.headers.update(headers)
        self.db = DB()
        self.crawl_timestamp = int()

    def run(self):
        while True:
            self.crawler()
            time.sleep(1800)

    def crawler(self):
        while True:
            self.crawl_timestamp = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            try:
                r = self.session.get(url='https://3g.dxy.cn/newh5/view/pneumonia')
            except requests.exceptions.ChunkedEncodingError:
                continue
            soup = BeautifulSoup(r.content, 'lxml')

            # overall_information = re.search(r'\{("id".*?)\]\}',str(soup.find('script', attrs={'id': 'getStatisticsService'})))
            # province_information = re.search(r'\[(.*?)\]', str(soup.find('script', attrs={'id': 'getListByCountryTypeService1'})))
            area_information = re.search(r'\[(.*)\]', str(soup.find('script', attrs={'id': 'getAreaStat'})))
            # abroad_information = re.search(r'\[(.*)\]', str(soup.find('script', attrs={'id': 'getListByCountryTypeService2'})))
            # news = re.search(r'\[(.*?)\]', str(soup.find('script', attrs={'id': 'getTimelineService'})))

            if not area_information:
                continue

            # self.overall_parser(overall_information=overall_information)
            # self.province_parser(province_information=province_information)
            self.area_parser(area_information=area_information)
            # if abroad_information is not None:
            # self.abroad_parser(abroad_information=abroad_information)
            # self.news_parser(news=news)

            break

        while True:
            self.crawl_timestamp = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            try:
                r = self.session.get(url='https://file1.dxycdn.com/2020/0127/797/3393185293879908067-115.json')
            except requests.exceptions.ChunkedEncodingError:
                continue
            # Use try-except to ensure the .json() call does not raise an unhandled exception.
            try:
                if r.status_code != 200:
                    continue
                elif r.json().get('code') == 'success':
                    self.rumor_parser(rumors=r.json().get('data'))
                    break
                else:
                    continue
            except json.decoder.JSONDecodeError:
                continue

        logger.info('Successfully crawled.')

    def overall_parser(self, overall_information):
        overall_information = json.loads(overall_information.group(0))
        overall_information.pop('id')
        overall_information.pop('createTime')
        overall_information.pop('modifyTime')
        overall_information.pop('imgUrl')
        overall_information.pop('deleted')
        overall_information['countRemark'] = overall_information['countRemark'].replace(' 疑似', ',疑似').replace(' 治愈',
                                                                                                              ',治愈').replace(
            ' 死亡', ',死亡').replace(' ', '')
        if not self.db.find_one(collection='DXYOverall', data=overall_information):
            overall_information['updateTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYOverall', data=overall_information)

    def province_parser(self, province_information):
        provinces = json.loads(province_information.group(0))
        for province in provinces:
            province.pop('id')
            province.pop('tags')
            province.pop('sort')
            province['comment'] = province['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYProvince', data=province):
                continue
            province['crawlTime'] = self.crawl_timestamp
            province['country'] = country_type.get(province['countryType'])

            self.db.insert(collection='DXYProvince', data=province)

    def area_parser(self, area_information):
        area_information = json.loads(area_information.group(0))
        for area in area_information:
            area['comment'] = area['comment'].replace(' ', '')
            # Check whether the data has changed; skip if unchanged
            if self.db.find_one(collection='DXYArea', data=area):
                continue
            # If the database already has data for this province, update it
            area_in_mongo = self.db.find_one(collection='DXYArea',
                                             data={'provinceShortName': area['provinceShortName']})
            if area_in_mongo is not None:
                self.db.update_one(collection='DXYArea', query={'_id': area_in_mongo['_id']}, data_after={
                    '$set': {'confirmedCount': area['confirmedCount'], 'suspectedCount': area['suspectedCount'],
                             'currentConfirmedCount': area['currentConfirmedCount'],
                             'curedCount': area['curedCount'], 'deadCount': area['deadCount'],
                             'cities': area['cities'], 'updateTime': self.crawl_timestamp}})
                logger.info(area['provinceShortName'] + "进行数据更新")
                continue
            # Otherwise insert a new record
            area['country'] = '中国'
            area['updateTime'] = self.crawl_timestamp
            self.db.insert(collection='DXYArea', data=area)
            logger.info(area['provinceShortName'] + "进行数据新增")

    def abroad_parser(self, abroad_information):
        countries = json.loads(abroad_information.group(0))
        for country in countries:
            country.pop('id')
            country.pop('tags')
            country.pop('countryType')
            country.pop('provinceId')
            country['country'] = country.get('provinceName')
            country['provinceShortName'] = country.get('provinceName')
            country.pop('cityName')
            country.pop('sort')

            country['comment'] = country['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYArea', data=country):
                continue
            country['updateTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYArea', data=country)

    def news_parser(self, news):
        news = json.loads(news.group(0))
        for _news in news:
            _news.pop('pubDateStr')
            if self.db.find_one(collection='DXYNews', data=_news):
                continue
            _news['crawlTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYNews', data=_news)

    def rumor_parser(self, rumors):
        for rumor in rumors:
            rumor.pop('score')
            rumor['body'] = rumor['body'].replace(' ', '')
            if self.db.find_one(collection='DXYRumors', data=rumor):
                continue
            rumor['crawlTime'] = self.crawl_timestamp

            self.db.insert(collection='DXYRumors', data=rumor)
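
The find_one / insert / update_one calls in the examples above behave like a thin wrapper around MongoDB (note the '_id' and '$set' usage in Example #5). A minimal pymongo-based sketch of such a wrapper, assuming a local MongoDB instance and a database named 'DXY'; the original projects' wrappers may differ:

from pymongo import MongoClient


class DB:
    def __init__(self, uri='mongodb://localhost:27017/', db_name='DXY'):
        self.client = MongoClient(uri)
        self.db = self.client[db_name]

    def find_one(self, collection, data):
        # Return the first document matching every field in `data`, or None.
        return self.db[collection].find_one(data)

    def insert(self, collection, data):
        return self.db[collection].insert_one(data)

    def update_one(self, collection, query, data_after):
        # `data_after` is a MongoDB update document, e.g. {'$set': {...}}.
        return self.db[collection].update_one(query, data_after)
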
Example #6
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.db = DB()
        self.crawl_timestamp = int()

    def run(self):
        # Because the crawler is triggered from Java, it does not need to run periodically here.
        # while True:
        self.crawler()
        #     time.sleep(60 * 30)  # 30-minute interval

    def crawler(self):
        while True:
            self.session.headers.update(
                {
                    'user-agent': random.choice(user_agent_list)
                }
            )
            self.crawl_timestamp = int(time.time() * 1000)

            try:
                r = self.session.get(url='https://ncov.dxy.cn/ncovh5/view/pneumonia')
            except requests.exceptions.ChunkedEncodingError:
                continue
            soup = BeautifulSoup(r.content, 'lxml')

            area_information = re.search(r'\[(.*)\]', str(soup.find('script', attrs={'id': 'getAreaStat'})))
            if area_information:
                self.area_parser(area_information=area_information)
            if not area_information:
                time.sleep(3)
                continue
            break

        logger.info('Successfully crawled.')

    def area_parser(self, area_information):
        area_information = json.loads(area_information.group(0))
        self.db.connect_db()
        self.db.create_db_table()
        for area in area_information:
            # used to hold each record
            rowdata = {}
            cities_backup = area.pop('cities')
            area['cities'] = cities_backup
            area['countryName'] = '中国'
            area['countryEnglishName'] = 'China'
            area['continentName'] = '亚洲'
            area['continentEnglishName'] = 'Asia'
            area['provinceEnglishName'] = city_name_map[area['provinceShortName']]['engName']
            area['updateTime'] = self.crawl_timestamp
            # add data to rowdata
            rowdata['updateTime'] = area['updateTime']
            rowdata['provinceName'] = area['provinceName']
            # If cities is not empty, parse every city in it.
            if cities_backup:
                for city in area['cities']:
                    if city['cityName'] != '待明确地区':
                        try:
                            city['cityEnglishName'] = city_name_map[area['provinceShortName']]['cities'][city['cityName']]
                            rowdata['cityName'] = city['cityName']
                            rowdata['currentConfirmedCount'] = city['currentConfirmedCount']
                            rowdata['confirmedCount'] = city['confirmedCount']
                            rowdata['suspectedCount'] = city['suspectedCount']
                            rowdata['curedCount'] = city['curedCount']
                            rowdata['deadCount'] = city['deadCount']
                            rowdata['locationId'] = city['locationId']
                            # add data into database
                            self.db.insert(rowdata)
                        except KeyError:
                            print(area['provinceShortName'], city['cityName'])
                            pass
                    else:
                        city['cityEnglishName'] = 'Area not defined'

            else:
                try:
                    rowdata['cityName'] = area['provinceName']
                    rowdata['currentConfirmedCount'] = area['currentConfirmedCount']
                    rowdata['confirmedCount'] = area['confirmedCount']
                    rowdata['suspectedCount'] = area['suspectedCount']
                    rowdata['curedCount'] = area['curedCount']
                    rowdata['deadCount'] = area['deadCount']
                    rowdata['locationId'] = area['locationId']
                    self.db.insert(rowdata)
                except KeyError:
                    print("No such key")
                    pass

        self.db.close_db_connection()
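
Example #6 writes flattened rows through a different DB interface (connect_db, create_db_table, insert, close_db_connection), which suggests a relational store rather than a document database. A minimal sqlite3-backed sketch under that assumption; the column set simply mirrors the rowdata keys above and is not the original schema:

import sqlite3


class DB:
    def __init__(self, path='ncov.sqlite3'):
        self.path = path
        self.conn = None

    def connect_db(self):
        self.conn = sqlite3.connect(self.path)

    def create_db_table(self):
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS area ('
            'updateTime INTEGER, provinceName TEXT, cityName TEXT, '
            'currentConfirmedCount INTEGER, confirmedCount INTEGER, '
            'suspectedCount INTEGER, curedCount INTEGER, deadCount INTEGER, '
            'locationId INTEGER)')

    def insert(self, rowdata):
        # Column names come from the crawler's own rowdata keys, so building SQL from them is safe here.
        columns = ', '.join(rowdata.keys())
        placeholders = ', '.join('?' for _ in rowdata)
        self.conn.execute(
            'INSERT INTO area ({}) VALUES ({})'.format(columns, placeholders),
            list(rowdata.values()))

    def close_db_connection(self):
        self.conn.commit()
        self.conn.close()
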
Example #7
class Crawler:
    def __init__(self):
        self.session = requests.session()
        self.session.headers.update({"user-agent": ua.random})
        self.db = DB()
        self.crawl_timestamp = int()

    def run(self):
        while True:
            self.tencent_crawler()
            self.dxy_crawler()
            # self.location_crawler()
            logger.info('All data crawled; sleeping for 1 hour')
            time.sleep(3600)
            logger.info('Sleep finished')

    def history_data_crawler(self):
        while True:
            try:
                overall = self.session.get(
                    url="https://lab.isaaclin.cn/nCoV/api/overall?latest=0")
                area = self.session.get(
                    url="https://lab.isaaclin.cn/nCoV/api/area")

            except requests.exceptions.ChunkedEncodingError:
                self.session.headers.update({"user-agent": ua.random})
                continue

            history_overall = json.loads(overall.text)['results']
            history_area = json.loads(area.text)['results']
            for i in history_overall:
                self.overall_parser(overall_information=i, keep_cursor=True)

            for i in history_area:
                self.history_area_parser(area=i, keep_cursor=True)

            break

    def dxy_crawler(self):
        # Nationwide epidemic data
        while True:
            logger.info('Start crawling DXY data')
            self.crawl_timestamp = int(
                datetime.datetime.timestamp(datetime.datetime.now()) * 1000)
            try:
                r = self.session.get(
                    url='https://3g.dxy.cn/newh5/view/pneumonia')
            except requests.exceptions.ChunkedEncodingError:
                self.session.headers.update({"user-agent": ua.random})
                continue
            soup = BeautifulSoup(r.content, 'lxml')

            overall_information = re.search(
                r'(\{"id".*\}\})\}',
                str(soup.find('script', attrs={'id': 'getStatisticsService'})))
            area_information = re.search(
                r'\[(.*)\]',
                str(soup.find('script', attrs={'id': 'getAreaStat'})))
            abroad_information = re.search(
                r'\[(.*)\]',
                str(
                    soup.find('script',
                              attrs={'id': 'getListByCountryTypeService2'})))

            if not overall_information or not area_information:
                continue
            logger.info('DXY data fetched successfully; writing to database')
            self.overall_parser(
                overall_information=json.loads(overall_information.group(1)))
            logger.info('DXY overall data written')
            self.area_parser(
                area_information=json.loads(area_information.group(0)))
            self.abroad_parser(
                abroad_information=json.loads(abroad_information.group(0)))
            logger.info('DXY area data written')
            logger.info('DXY data crawl finished')
            break

    def location_crawler(self):

        # Detailed epidemic locations
        while True:
            logger.info('Start crawling detailed location data')
            locations = []
            try:
                fail_count, count = 0, 0
                while fail_count < 5:
                    location = self.session.get(
                        url="https://assets.cbndata.org/2019-nCoV/{}/data.json"
                        .format(count))
                    if location.ok is True:
                        count += 1
                        locations.append(location)
                    else:
                        fail_count += 1

            except requests.exceptions.ChunkedEncodingError:
                logger.info('Failed to fetch detailed location data; retrying')
                self.session.headers.update({"user-agent": ua.random})
                continue
            logger.info('Detailed location data fetched successfully; writing to database')
            for location in locations:
                location = json.loads(location.text)['data']
                for i in location:
                    self.location_parser(i, keep_cursor=True)
                self.db.close_cursor()
            logger.info('Detailed location data crawl finished')
            break

    def tencent_crawler(self):

        # Daily additions
        while True:
            logger.info('Start crawling Tencent data')
            try:
                daily = self.session.get(
                    url=
                    "https://view.inews.qq.com/g2/getOnsInfo?name=disease_other"
                )
            except requests.exceptions.ChunkedEncodingError:
                self.session.headers.update({"user-agent": ua.random})
                logger.info('Failed to fetch Tencent data; retrying')
                continue
            logger.info('Tencent data fetched successfully; writing to database')
            daily_json = json.loads(daily.text)['data']
            daily_dict = json.loads(daily_json)
            day_add_list = daily_dict['chinaDayAddList']
            day_list = daily_dict['chinaDayList']
            for daily in day_add_list:
                self.day_add_list_parser(daily, keep_cursor=True)
            for daily in day_list:
                self.day_list_parser(daily, keep_cursor=True)
            self.db.close_cursor()
            logger.info('Tencent data crawl finished')
            break

    def overall_parser(self, overall_information, keep_cursor=False):
        self.db.open_cursor()
        overall_information['countRemark'] = overall_information[
            'countRemark'].replace(' 疑似', ',疑似').replace(' 治愈', ',治愈').replace(
                ' 死亡', ',死亡').replace(' ', '')
        data = dict()
        data['countRemark'] = overall_information['countRemark']
        data['virus'] = self.change_remark('virus', overall_information)
        data['infectSource'] = self.change_remark('infectSource',
                                                  overall_information)
        data['passWay'] = self.change_remark('passWay', overall_information)
        data['remark1'] = overall_information['remark1']
        data['remark2'] = overall_information['remark2']
        data['remark3'] = overall_information['remark3']
        data['remark4'] = overall_information['remark4']
        data['remark5'] = overall_information['remark5']
        data['confirmedCount'] = overall_information['confirmedCount']
        data['suspectedCount'] = overall_information['suspectedCount']
        data['curedCount'] = overall_information['curedCount']
        data['deadCount'] = overall_information['deadCount']
        if 'updateTime' in overall_information:
            data['updateTime'] = overall_information['updateTime']
        else:
            data['updateTime'] = self.crawl_timestamp

        is_repeat = self.db.is_repeat(collection='DXYOverall', data=data)
        if not is_repeat:
            self.db.insert(collection='DXYOverall', data=data)
        self.db.close_cursor(keep_cursor)

    @staticmethod
    def change_remark(key, data):
        res = data[key]
        if '说明' in data[key]:
            operated = 'note{}'.format(data[key][-1])
            if operated in data:
                res = data[operated]

        return res

    def province_parser(self, province_information):
        provinces = json.loads(province_information.group(0))
        for province in provinces:
            province.pop('id')
            province.pop('tags')
            province.pop('sort')
            province['comment'] = province['comment'].replace(' ', '')
            if self.db.find_one(collection='DXYProvince', data=province):
                continue
            province['crawlTime'] = self.crawl_timestamp
            province['country'] = country_type.get(province['countryType'])

            self.db.insert(collection='DXYProvince', data=province)

    def area_parser(self, area_information, keep_cursor=False):
        self.db.open_cursor()
        for area in area_information:
            area['comment'] = area['comment'].replace(' ', '')
            area['country'] = '中国'
            area['continents'] = '亚洲'
            if 'updateTime' not in area:
                area['updateTime'] = self.crawl_timestamp
            area.pop('locationId')
            area['cities'] = json.dumps(area['cities'])
            is_repeat = self.db.is_repeat(collection='DXYArea', data=area)
            if not is_repeat:
                self.db.insert(collection='DXYArea', data=area)
        self.db.close_cursor(keep_cursor)

    def abroad_parser(self, abroad_information, keep_cursor=False):
        countries = abroad_information
        self.db.open_cursor()
        for country in countries:
            country['country'] = country.get('provinceName')
            country['provinceShortName'] = country.get('provinceName')
            country['comment'] = country['comment'].replace(' ', '')

            data = dict()
            data['continents'] = country['continents']
            data['country'] = country['country']
            data['provinceName'] = country['provinceName']
            data['provinceShortName'] = country['provinceShortName']
            data['confirmedCount'] = country['confirmedCount']
            data['suspectedCount'] = country['suspectedCount']
            data['curedCount'] = country['curedCount']
            data['deadCount'] = country['deadCount']
            if 'cities' in country:
                data['cities'] = country['cities']
            else:
                data['cities'] = '[]'
            data['comment'] = country['comment']
            if 'updateTime' not in data:
                data['updateTime'] = self.crawl_timestamp

            is_repeat = self.db.is_repeat(collection='DXYArea', data=data)
            if not is_repeat:
                self.db.insert(collection='DXYArea', data=data)
        self.db.close_cursor(keep_cursor)

    def history_area_parser(self, area, keep_cursor=False):
        self.db.open_cursor()
        if area['country'] == '中国':
            area['continents'] = '亚洲'
        else:
            area['continents'] = 'temp'
        if 'cities' in area:
            area['cities'] = json.dumps(area['cities'])
        else:
            area['cities'] = '[]'
        is_repeat = self.db.is_repeat(collection='DXYArea', data=area)
        if not is_repeat:
            self.db.insert(collection='DXYArea', data=area)
        self.db.close_cursor(keep_cursor)

    def location_parser(self, location, keep_cursor=False):
        self.db.open_cursor()
        if 'longitude' not in location:
            return False
        is_repeat = self.db.is_repeat(collection='location', data=location)
        if not is_repeat:
            self.db.insert(collection='location', data=location)
        else:
            try:
                self.db.update(collection='location', data=location)
            except Exception:
                pass
        self.db.close_cursor(keep_cursor)

    def day_add_list_parser(self, daily, keep_cursor=False):
        self.db.open_cursor()
        is_repeat = self.db.is_repeat(collection='day_add_list', data=daily)
        if not is_repeat:
            self.db.insert(collection='day_add_list', data=daily)

        self.db.close_cursor(keep_cursor)

    def day_list_parser(self, day_add_list, keep_cursor=False):
        self.db.open_cursor()
        is_repeat = self.db.is_repeat(collection='day_list', data=day_add_list)
        if not is_repeat:
            self.db.insert(collection='day_list', data=day_add_list)

        self.db.close_cursor(keep_cursor)
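
Example #7 decodes the Tencent payload twice because the API returns a JSON envelope whose 'data' field is itself a JSON-encoded string. A small illustration on a hypothetical response body; the real endpoint returns many more fields:

import json

inner = json.dumps({'chinaDayAddList': [],
                    'chinaDayList': [{'date': '01.28', 'confirm': 4515}]})
body = json.dumps({'ret': 0, 'data': inner})

daily_json = json.loads(body)['data']  # first pass: 'data' is still a string
daily_dict = json.loads(daily_json)    # second pass: decode the embedded JSON
print(daily_dict['chinaDayList'][0]['confirm'])  # -> 4515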