def abroad_parser(self, abroad_information):
    """Parse the overseas per-country records and store the new ones.

    Args:
        abroad_information: Object whose ``group(0)`` returns JSON text
            that decodes to a list of dicts, one per country
            (presumably a regular-expression match; confirm against caller).

    Returns:
        None. Records not already found by ``self.db.find_one`` are
        written with ``self.db.insert`` into the ``DXYArea`` collection.

    Raises:
        KeyError: If a record has no ``continents`` key.
    """
    # Fields that are thrown away before the record is compared and stored.
    # The original provinceShortName values are blank strings, so that field
    # is discarded here and rebuilt from provinceName further down.
    discarded_keys = (
        'id',
        'tags',
        'countryType',
        'provinceId',
        'cityName',
        'sort',
        'provinceShortName',
    )

    countries = json.loads(abroad_information.group(0))
    for country in countries:
        for key in discarded_keys:
            # The default keeps a missing throw-away field from raising
            # KeyError and aborting the rest of the batch.
            country.pop(key, None)

        # Rename the key continents to continentName
        country['continentName'] = country.pop('continents')
        if 'comment' in country:
            country['comment'] = country['comment'].replace(' ', '')

        # The duplicate check runs on the trimmed record, before any of the
        # derived fields below (including updateTime) are added.
        if self.db.find_one(collection='DXYArea', data=country):
            continue

        province_name = country.get('provinceName')
        country['countryName'] = province_name
        country['provinceShortName'] = province_name
        country['continentEnglishName'] = continent_name_map.get(
            country['continentName'])
        country['countryEnglishName'] = country_name_map.get(
            country['countryName'])
        country['provinceEnglishName'] = country_name_map.get(
            country['countryName'])
        country['updateTime'] = self.crawl_timestamp

        self.db.insert(collection='DXYArea', data=country)
def abroad_parser(self, abroad_information):
    """Parse the overseas per-country records and store the new ones.

    Args:
        abroad_information: Object whose ``group(0)`` returns JSON text
            that decodes to a list of dicts, one per country
            (presumably a regular-expression match; confirm against caller).

    Returns:
        None. Records not already found by ``self.db.find_one`` are
        printed and written with ``self.db.insert`` into the ``area``
        collection.

    Raises:
        KeyError: If a record has no ``continents`` key.
    """
    # Fields that are thrown away before the record is compared and stored.
    discarded_keys = (
        'id',
        'tags',
        'sort',
        # Ding Xiang Yuan has a large number of duplicates whose values are
        # all the same but whose modifyTime differs. modifyTime is presumably
        # the modification time of the whole data set rather than of this
        # one document, so the field is dropped.
        'modifyTime',
        # createTime also differs between otherwise identical records.
        # Originally it represented the first diagnosis of the virus in the
        # area, but it seems to mean something else for abroad information.
        'createTime',
        'countryType',
        'provinceId',
        'cityName',
        # The original provinceShortName values are blank strings; the field
        # is rebuilt from provinceName further down.
        'provinceShortName',
    )

    countries = json.loads(abroad_information.group(0))
    for country in countries:
        for key in discarded_keys:
            # Each key is dropped independently. A single try/except around
            # all the pops would skip every later one as soon as one key was
            # missing, leaving stale fields in the duplicate check.
            country.pop(key, None)

        if 'comment' in country:
            country['comment'] = country['comment'].replace(' ', '')

        # Rename the key continents to continentName
        country['continentName'] = country.pop('continents')

        # The duplicate check runs on the trimmed record, before any of the
        # derived fields below (including updateTime) are added.
        if self.db.find_one(collection='area', data=country):
            continue

        province_name = country.get('provinceName')
        country['countryName'] = province_name
        country['provinceShortName'] = province_name
        country['continentEnglishName'] = continent_name_map.get(
            country['continentName'])
        country['countryEnglishName'] = country_name_map.get(
            country['countryName'])
        country['provinceEnglishName'] = country_name_map.get(
            country['countryName'])
        country['updateTime'] = self.crawl_timestamp

        # NOTE(review): looks like leftover debug output; kept so the
        # method's visible behaviour is unchanged. Confirm before removing.
        print(country)
        self.db.insert(collection='area', data=country)
def abroad_parser(self, abroad_information):
    """Parse the overseas per-country records and return them normalised.

    Args:
        abroad_information: Object whose ``group(0)`` returns JSON text
            that decodes to a list of dicts, one per country
            (presumably a regular-expression match; confirm against caller).

    Returns:
        The decoded list, with every record modified in place: throw-away
        fields removed, ``continents`` renamed to ``continentName``, spaces
        stripped from ``comment``, and the country and English-name fields
        filled in.

    Raises:
        KeyError: If a record has no ``continents`` key.
    """
    # Fields that are thrown away from every record.
    # The original provinceShortName values are blank strings, so that field
    # is discarded here and rebuilt from provinceName further down.
    discarded_keys = (
        'id',
        'tags',
        'countryType',
        'provinceId',
        'cityName',
        'sort',
        'provinceShortName',
    )

    countries = json.loads(abroad_information.group(0))
    for country in countries:
        for key in discarded_keys:
            # The default keeps a missing throw-away field from raising
            # KeyError and aborting the rest of the batch.
            country.pop(key, None)

        # Rename the key continents to continentName
        country['continentName'] = country.pop('continents')
        if 'comment' in country:
            country['comment'] = country['comment'].replace(' ', '')

        province_name = country.get('provinceName')
        country['countryName'] = province_name
        country['provinceShortName'] = province_name
        country['continentEnglishName'] = continent_name_map.get(country['continentName'])
        country['countryEnglishName'] = country_name_map.get(country['countryName'])
        country['provinceEnglishName'] = country_name_map.get(country['countryName'])

    return countries