def get_vic_carto_datapoints():
    # [{"postcode": 3006, "_new": 0, "activedisp": "Five or fewer active cases",
    #   "cases": 65, "ratedisp": 13, "population": 18811},
    #
    # {"rows":[{"cartodb_id":287,
    #           "the_geom":"0101000020E6100000386744696F226240E10B93A982E942C0",
    #           "the_geom_webmercator":"0101000020110F00008D3881B2A4CD6E41295C51BCE25F51C1",
    #           "postcode":3126,"affected":0,"band":"None","lat":-37.8243,"lon":145.0761,
    #           "suburbs":"Camberwell East, Canterbury","active":0,"rate":0,"total":2},

    # Roll the report date over at 8:30pm local time, so earlier runs are
    # filed under the previous day's date.
    date = (datetime.now() - timedelta(hours=20, minutes=30)).strftime('%Y_%m_%d')

    dir_ = get_data_dir() / 'vic' / 'newmap_postcode' / date
    if not exists(dir_):
        makedirs(dir_)

    postcode_json_path = dir_ / 'postcode.json'
    if not exists(postcode_json_path):
        urlretrieve(
            "https://dhhs.carto.com:443/api/v2/sql?q=select%20*%20from%20public.covid19_postcodes",
            postcode_json_path)

    r = DataPointMerger()
    dates = sorted(listdir(get_data_dir() / 'vic' / 'newmap_postcode'))
    if date not in dates:
        dates.append(date)

    for i_date in dates:
        path = get_data_dir() / 'vic' / 'newmap_postcode' / i_date / 'postcode.json'
        r.extend(_get_datapoints(i_date, path))
    return r

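# Illustrative sketch (not part of the original module): how a row from the
# downloaded postcode.json response shown in the comment above could be mapped
# to plain per-postcode records. The real parsing lives in _get_datapoints(),
# which isn't shown here; the field names ("postcode", "active", "total") are
# taken from the sample response, and parse_carto_postcode_rows() is a
# hypothetical helper name.
def parse_carto_postcode_rows(postcode_json_path):
    import json

    with open(postcode_json_path, 'r', encoding='utf-8') as f:
        data = json.loads(f.read())

    for row in data.get('rows', []):
        yield {
            'postcode': str(row['postcode']),
            'total': int(row['total'] or 0),
            'active': int(row['active'] or 0),
        }
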
def _get_regions_data(self):
    # {"data":[[["Aglonas novads",0,"0","56.0965 27.114","Aglonas novads"],

    out = DataPointMerger()
    base_dir = self.get_path_in_dir('')

    for date in self.iter_nonempty_dirs(base_dir):
        r = self.sdpf()
        path = f'{base_dir}/{date}/regions_data.json'
        print(path)

        with open(path, 'r', encoding='utf-8') as f:
            data = f.read()

        if '<!DOCTYPE HTML>' in data:
            # Skip snapshots where an HTML error page was saved instead of JSON
            continue

        # WARNING - TODO: Add agegroup data etc. from the new page!
        data = json.loads(data)

        for i_data in data['data']:
            for region_name, value, *leftover in i_data:
                print(region_name)

                # Only confirmed and deaths are shown in the dashboard
                date = datetime.datetime.fromtimestamp(
                    data['refreshed'] / 1000.0).strftime('%Y_%m_%d')

                if value is not None:
                    r.append(region_schema=Schemas.ADMIN_1,
                             region_parent='LV',
                             region_child=region_name,
                             datatype=DataTypes.TOTAL,
                             value=int(value),
                             date_updated=date,
                             source_url=self.SOURCE_URL)

        out.extend(r)
    return out

def __postcode_datapoints_to_lga(self, SOURCE_URL, postcode_to_lga, r, source_id):
    # Convert postcode to LGA where possible
    new_r = DataPointMerger()
    added_to_lga = set()
    processed_postcode = set()
    mapping = Counter()

    for datapoint in sorted(r, key=lambda i: i.date_updated):
        if datapoint.region_schema == Schemas.LGA:
            added_to_lga.add((
                datapoint.region_child,
                datapoint.datatype
            ))
            continue
        elif datapoint.region_schema != Schemas.POSTCODE:
            continue
        elif datapoint.region_child in postcode_to_lga:
            lga = postcode_to_lga[datapoint.region_child]
        else:
            lga = 'unknown'
            if datapoint.region_child != 'unknown':
                print("NOT FOUND:", datapoint.region_child)
            #continue  # WARNING!!!

        if (datapoint.region_child, datapoint.datatype, datapoint.date_updated) in processed_postcode:
            #print("IGNORING DOUBLE-UP:", datapoint)
            continue
        processed_postcode.add((datapoint.region_child, datapoint.datatype, datapoint.date_updated))

        #if lga == 'cumberland':
        #    print('USING:', datapoint)

        mapping[
            lga, datapoint.datatype, datapoint.date_updated
        ] += datapoint.value

    new_r.extend(r)

    for (lga, datatype, date_updated), value in mapping.items():
        if (lga, datatype) in added_to_lga:
            # Don't add to LGA if available using direct data!
            continue

        new_r.append(DataPoint(
            region_schema=Schemas.LGA,
            region_parent='AU-NSW',
            region_child=lga,
            datatype=datatype,
            value=value,
            date_updated=date_updated,
            source_url=SOURCE_URL,
            source_id=source_id
        ))
    return new_r

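# Illustrative sketch (not part of the original module): the core of
# __postcode_datapoints_to_lga() above is a Counter keyed by
# (lga, datatype, date) that sums postcode-level values, counting each
# (postcode, datatype, date) combination only once and skipping any LGA the
# source already reports directly. The postcode->LGA map and the toy values
# below are made up for demonstration only.
def _demo_postcode_to_lga_rollup():
    from collections import Counter

    postcode_to_lga = {'2145': 'cumberland', '2150': 'parramatta'}
    postcode_counts = [                 # (postcode, date, value)
        ('2145', '2020_07_01', 3),
        ('2145', '2020_07_01', 3),      # duplicate: should be counted once
        ('2150', '2020_07_01', 5),
    ]

    mapping = Counter()
    seen = set()
    for postcode, date, value in postcode_counts:
        if (postcode, date) in seen:    # ignore double-ups
            continue
        seen.add((postcode, date))
        lga = postcode_to_lga.get(postcode, 'unknown')
        mapping[lga, date] += value

    assert mapping['cumberland', '2020_07_01'] == 3
    assert mapping['parramatta', '2020_07_01'] == 5
    return mapping
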
def get_datapoints(self):
    r = DataPointMerger()

    totals = Counter()
    added_totals = defaultdict(set)
    tests = Counter()
    added_tests = defaultdict(set)

    for date in sorted(listdir(get_overseas_dir() / 'fr' / 'esridata')):
        r.extend(self._get_positive_by_department(date, totals, added_totals,
                                                  tests, added_tests))
    return r

def get_datapoints(self):
    r = DataPointMerger()

    with set_locale('tr_TR.utf8'):
        for dir_ in sorted(listdir(self.output_dir)):
            with open(self.output_dir / dir_ / 'covid19_in_turkey.html',
                      'r', encoding='utf-8') as f:
                html = f.read()

            r.extend(self._get_total_datapoints(html))
            r.extend(self._get_recovered_death_datapoints(html))
    return r

def get_datapoints(self):
    ua = URLArchiver('sa/dashboard')
    i_r = DataPointMerger()

    for period in ua.iter_periods():
        for subperiod_id, subdir in ua.iter_paths_for_period(period):
            path = ua.get_path(subdir)

            with open(path, 'r', encoding='utf-8') as f:
                data = json.loads(f.read())
            i_r.extend(self._get_from_json(data))
    return i_r

def get_datapoints(self):
    # {
    #     "regions-history": [
    #         {
    #             "date": "2020-04-20",
    #             "regions": [
    #                 {
    #                     "cases_per_100000_people": 11.88,
    #                     "population": 606170,
    #                     "region_cases": 72,
    #                     "region_en_name": "East Macedonia and Thrace",
    #                     "region_gr_name": "Ανατολική Μακεδονία και Θράκη"
    #                 },

    out = DataPointMerger()
    base_dir = self.get_path_in_dir('')

    region_map = {
        'west greece': 'GR-G',
        'central greece': 'GR-H',
        'north aegean': 'GR-K',
        'west macedonia': 'GR-C',
        'without permanent residency in greece': 'Other',
        'under investigation': 'Unknown',
    }

    for date in self.iter_nonempty_dirs(base_dir):
        if date >= '2020_10_10':
            continue  # NOTE: this source stopped being updated after this date!

        r = self.sdpf()
        path = f'{base_dir}/{date}/regions.json'

        with open(path, 'r', encoding='utf-8') as f:
            data = json.loads(f.read())

        for day_dict in data['regions-history']:
            date = self.convert_date(day_dict['date'])

            for region_dict in day_dict['regions']:
                r.append(region_schema=Schemas.ADMIN_1,
                         region_parent='GR',
                         region_child=region_map.get(
                             region_dict['region_en_name'].lower(),
                             region_dict['region_en_name']),
                         datatype=DataTypes.TOTAL,
                         value=int(region_dict['region_cases']),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

        out.extend(r)
    return out

def get_datapoints(self):
    r = DataPointMerger()

    ua = URLArchiver(f'{self.STATE_NAME}/current_statistics')
    ua.get_url_data(self.STATS_BY_REGION_URL_2,
                    cache=not ALWAYS_DOWNLOAD_LISTING)

    for period in ua.iter_periods():
        for subperiod_id, subdir in ua.iter_paths_for_period(period):
            path = ua.get_path(subdir)

            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                html = f.read()

            cbr = self._get_total_cases_by_region(self.STATS_BY_REGION_URL_2, html)
            if cbr:
                r.extend(cbr)

            total = self._get_total_cases(self.STATS_BY_REGION_URL_2, html)
            if total:
                r.append(total)

            new = self._get_total_new_cases(self.STATS_BY_REGION_URL_2, html)
            if new:
                r.append(new)

            tested = self._get_total_cases_tested(self.STATS_BY_REGION_URL_2, html)
            if tested:
                r.append(tested)

            age_breakdown = self._get_total_age_breakdown(self.STATS_BY_REGION_URL_2, html)
            if age_breakdown:
                r.extend(age_breakdown)

            dhr = self._get_total_dhr(self.STATS_BY_REGION_URL_2, html)
            if dhr:
                r.extend(dhr)

            soi = self._get_total_source_of_infection(self.STATS_BY_REGION_URL_2, html)
            if soi:
                r.extend(soi)

    r.extend(StateNewsBase.get_datapoints(self))
    return r

def _get_recovered_sum(self):
    out = DataPointMerger()
    base_dir = self.get_path_in_dir('')

    for date in self.iter_nonempty_dirs(base_dir):
        r = self.sdpf()
        path = f'{base_dir}/{date}/tw_corona.html'
        print(path)

        try:
            with open(path, 'r', encoding='utf-8') as f:
                html = f.read()
        except UnicodeDecodeError:
            # Some snapshots were saved Brotli-compressed
            import brotli
            with open(path, 'rb') as f:
                html = brotli.decompress(f.read()).decode('utf-8')

        new_data_template = '.geojson","series":[{'
        if new_data_template in html:
            data = '[{%s}]' % html.split(new_data_template)[-1].split('}],')[0]
        else:
            data = (html.split('var jdata1 = ')[-1]
                        .split('\n')[0].strip().strip(';').replace("'", '"'))

        for item in json.loads(data):
            # [{'code':'Taipei City', 'value':118}, ...]
            region = place_map[item['code'].strip().lower()]

            r.append(region_schema=Schemas.ADMIN_1,
                     region_parent='TW',
                     region_child=region,
                     datatype=DataTypes.TOTAL,
                     value=int(item['value']),
                     date_updated=date,
                     source_url=self.SOURCE_URL)

        out.extend(r)
    return out

def _get_county_data(self):
    # ORIGID CountyName PopulationCensus16 IGEasting IGNorthing Lat Long UniqueGeographicIdentifier ConfirmedCovidCases PopulationProportionCovidCases ConfirmedCovidDeaths ConfirmedCovidRecovered x y FID TimeStampDate
    #        Carlow     56932  278661 163444 52.7168 -6.8367 http://data.geohive.ie/resource/county/2ae19629-143d-13a3-e055-000000000001 175 307.384247874657  -6.8367 52.7168 194903 2020/07/01 00:00:00+00
    #        Cavan      76176  246380 304501 53.9878 -7.2937 http://data.geohive.ie/resource/county/2ae19629-1448-13a3-e055-000000000001 862 1131.5900021004   -7.2937 53.9878 194904 2020/07/01 00:00:00+00
    #        Clare      118817 133493 182732 52.8917 -8.9889 http://data.geohive.ie/resource/county/2ae19629-1450-13a3-e055-000000000001 368 309.719989563783  -8.9889 52.8917 194905 2020/07/01 00:00:00+00

    out = DataPointMerger()
    base_dir = self.get_path_in_dir('')

    for date in self.iter_nonempty_dirs(base_dir):
        r = self.sdpf()
        path = f'{base_dir}/{date}/county_data.csv'

        with open(path, 'r', encoding='utf-8') as f:
            for item in csv.DictReader(f):
                date = self.convert_date(item['TimeStampDate'].split()[0])

                for datatype, value in (
                        (DataTypes.TOTAL, int(item['ConfirmedCovidCases'])),
                        (DataTypes.STATUS_DEATHS, int(item['ConfirmedCovidDeaths'] or 0)),
                        (DataTypes.STATUS_RECOVERED, int(item['ConfirmedCovidRecovered'] or 0)),
                        (DataTypes.STATUS_ACTIVE,
                         int(item['ConfirmedCovidCases'] or 0)
                         - int(item['ConfirmedCovidDeaths'] or 0)
                         - int(item['ConfirmedCovidRecovered'] or 0))):
                    r.append(region_schema=Schemas.ADMIN_1,
                             region_parent='IE',
                             region_child=item['CountyName'],
                             datatype=datatype,
                             value=value,
                             date_updated=date,
                             source_url=self.SOURCE_URL)

        out.extend(r)
    return out

def _get_regions_data(self):
    out = DataPointMerger()
    base_dir = self.get_path_in_dir('')

    for date in self.iter_nonempty_dirs(base_dir):
        r = self.sdpf()
        path = f'{base_dir}/{date}/regions_data.json'

        with open(path, 'r') as f:
            data = json.loads(f.read())

        for feature in data['features']:
            attributes = feature['attributes']

            r.append(region_schema=Schemas.ADMIN_1,
                     region_parent='LV',
                     region_child=attributes['Nos_pilns'],
                     datatype=DataTypes.TOTAL,
                     value=int(attributes['Covid_sasl']),
                     date_updated=date,
                     source_url=self.SOURCE_URL)

        out.extend(r)
    return out

def _get_recovered_sum(self):
    out = DataPointMerger()
    base_dir = self.get_path_in_dir('')

    for date in self.iter_nonempty_dirs(base_dir):
        r = self.sdpf()
        path = f'{base_dir}/{date}/is_index.html'

        with open(path, 'rb') as f:
            data = f.read()
        data = data.decode('utf-8')

        # TODO: There are quite a few more stats on this page!
        regional_stats = data.split(
            '[[[null,{"font-weight":"700","value":"Infections"},'
            '{"font-weight":"700","value":"Quarantine"}],')[1].split(']]],')[0]
        #print(regional_stats)
        regional_stats = json.loads(f'[{regional_stats}]]')

        for region, infections_dict, quarantine_dict in regional_stats:
            region = place_map[region]

            r.append(
                region_schema=Schemas.ADMIN_1,
                region_parent='IS',
                region_child=region,
                datatype=DataTypes.TOTAL,
                # This changed from a dict to an int on 9 Jun
                value=int(infections_dict['value'])
                      if isinstance(infections_dict, dict)
                      else int(infections_dict),
                date_updated=date,
                source_url=self.SOURCE_URL)

        out.extend(r)
    return out

def get_datapoints(self):
    r = DataPointMerger()

    # Start Hyper
    with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper:
        for date in sorted(listdir(self.output_dir)):
            self.__unzip_date(date)

            # active_cases/Data/dash-transmission/vic-details-transmissions-pub-extract.hyper
            # cases/Data/dash-charts/vic_detailed_prep Extract_daily-pubextract.hyper
            r.extend(self.__get_genderagegroup_datapoints(hyper, date))
            r.extend(self.__get_transmissions_datapoints(hyper, date))
            r.extend(self.__get_transmissions_over_time_datapoints(hyper, date))
    return r

def _get_by_datos_isciii(self):
    out = DataPointMerger()

    # Fecha,cod_ine,CCAA,Casos,PCR+,TestAc+,Hospitalizados,UCI,Fallecidos,Recuperados
    # 2020-02-20,01,Andalucía,0,0,,,,,
    # 2020-02-20,02,Aragón,,0,,,,,
    # 2020-02-20,03,Asturias,,0,,,,,
    # 2020-02-20,04,Baleares,,1,,,,,
    # 2020-02-20,05,Canarias,,1,,,,,
    # 2020-02-20,06,Cantabria,,0,,,,,
    # 2020-02-20,08,Castilla La Mancha,,0,,,,,
    # 2020-02-20,07,Castilla y León,,0,,,,,

    with open(self.get_path_in_dir('COVID 19/ccaa_covid19_datos_isciii.csv'),
              'r', encoding='utf-8') as f:
        r = self.sdpf()

        for item in csv.DictReader(f):
            #print(item)
            date = self.convert_date(item['Fecha'])
            ac_code = region_map[item['CCAA']]

            if item['Casos']:
                r.append(region_schema=Schemas.ADMIN_1,
                         region_parent='ES',
                         region_child=ac_code,
                         datatype=DataTypes.TOTAL,
                         value=int(item['Casos']),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

            if item['PCR+'] or item['TestAc+']:
                # NOTE: PCR+ and TestAc+ (antibody) results are combined here!
                r.append(region_schema=Schemas.ADMIN_1,
                         region_parent='ES',
                         region_child=ac_code,
                         datatype=DataTypes.TESTS_TOTAL,
                         value=int(item['PCR+'] or 0) + int(item['TestAc+'] or 0),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

            if item['Hospitalizados']:
                r.append(region_schema=Schemas.ADMIN_1,
                         region_parent='ES',
                         region_child=ac_code,
                         datatype=DataTypes.STATUS_HOSPITALIZED,
                         value=int(item['Hospitalizados']),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

            if item['Fallecidos']:
                r.append(region_schema=Schemas.ADMIN_1,
                         region_parent='ES',
                         region_child=ac_code,
                         datatype=DataTypes.STATUS_DEATHS,
                         value=int(item['Fallecidos']),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

            if item.get('Recuperados'):
                r.append(region_schema=Schemas.ADMIN_1,
                         region_parent='ES',
                         region_child=ac_code,
                         datatype=DataTypes.STATUS_RECOVERED,
                         value=int(item['Recuperados']),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

        out.extend(r)
    return out

def _get_positive_by_department(self, date, totals, added_totals, tests, added_tests):
    out = DataPointMerger()
    base_path = get_overseas_dir() / 'fr' / 'esridata' / date

    for fnam in sorted(listdir(base_path)):
        r = self.sdpf()

        with open(base_path / fnam, 'r', encoding='utf-8') as f:
            data = json.loads(f.read())
        print(base_path, fnam, data)

        for feature in data['features']:
            attributes = feature['attributes']
            print(feature)
            date = self.convert_date(attributes['Jour'])

            try:
                region_child = 'FR-%02d' % int(attributes['CODE_DEPT'])
            except ValueError:
                # e.g. 2A (Corsican departments have non-numeric codes)
                region_child = 'FR-%s' % attributes['CODE_DEPT']

            for datatype, value in (
                    (DataTypes.STATUS_HOSPITALIZED, attributes['Hospitalisation_T']),
                    #(DataTypes.FIXME, attributes['Hospitalisation_H']),
                    #(DataTypes.FIXME, attributes['Hospitalisation_F']),
                    (DataTypes.STATUS_ICU, attributes['Reanimation_T']),
                    #(DataTypes.FIXME, attributes['Reanimation_H']),
                    #(DataTypes.FIXME, attributes['Reanimation_F']),
                    (DataTypes.STATUS_DEATHS, attributes['Deces_T']),
                    #(DataTypes.FIXME, attributes['Deces_H']),
                    #(DataTypes.FIXME, attributes['Deces_F']),
                    (DataTypes.NEW, attributes['Tests_Viro_P']),
                    #(DataTypes.TESTS_TOTAL, attributes['Tests_Viro_T'])  # FIXME: this is new tests!
                    ):
                if value is None:
                    continue

                r.append(
                    region_schema=Schemas.ADMIN_1,
                    region_parent='FR',
                    region_child=region_child,
                    datatype=datatype,
                    value=value,
                    date_updated=date,
                    source_url=self.SOURCE_URL
                )

            # I don't think Nbre_Cas_Confirmes is ever not None
            assert attributes['Nbre_Cas_Confirmes'] is None, attributes

            if attributes['Tests_Viro_P'] and date not in added_totals[region_child]:
                added_totals[region_child].add(date)
                totals[region_child] += attributes['Tests_Viro_P']  # or attributes['Nbre_Cas_Confirmes']

                r.append(
                    region_schema=Schemas.ADMIN_1,
                    region_parent='FR',
                    region_child=region_child,
                    datatype=DataTypes.TOTAL,
                    value=totals[region_child],
                    date_updated=date,
                    source_url=self.SOURCE_URL
                )

            if attributes['Tests_Viro_T'] and date not in added_tests[region_child]:
                added_tests[region_child].add(date)
                tests[region_child] += attributes['Tests_Viro_T']

                r.append(
                    region_schema=Schemas.ADMIN_1,
                    region_parent='FR',
                    region_child=region_child,
                    datatype=DataTypes.TESTS_TOTAL,
                    value=tests[region_child],
                    date_updated=date,
                    source_url=self.SOURCE_URL
                )

        out.extend(r)
    return out

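# Illustrative sketch (not part of the original module): the French feed only
# exposes per-day new positives (Tests_Viro_P), so the caller's get_datapoints()
# shares a Counter and a per-department "already counted" set across every
# snapshot date to build a running cumulative TOTAL without double-counting a
# report date. The department code and figures below are made up for
# demonstration only.
def _demo_running_totals():
    from collections import Counter, defaultdict

    totals = Counter()
    added_totals = defaultdict(set)
    daily_new = [                       # (department, report date, new positives)
        ('FR-75', '2020-05-01', 10),
        ('FR-75', '2020-05-02', 7),
        ('FR-75', '2020-05-02', 7),     # same date seen in a later snapshot: skipped
    ]

    cumulative = []
    for dept, date, new in daily_new:
        if date in added_totals[dept]:
            continue
        added_totals[dept].add(date)
        totals[dept] += new
        cumulative.append((dept, date, totals[dept]))

    assert cumulative == [('FR-75', '2020-05-01', 10),
                          ('FR-75', '2020-05-02', 17)]
    return cumulative
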
def get_datapoints(self):
    r = DataPointMerger()

    for date in r.iter_unprocessed_dates(
            sorted(listdir(get_data_dir() / 'vic' / 'csv_data'))):
        r.extend(self._get_postcode_datapoints(date))
        r.extend(self._get_lga_datapoints(date))
        #print(get_data_dir(), date)

        if (get_data_dir() / 'vic' / 'csv_data' / date / 'agegroup.csv').exists():
            r.extend(self._get_agegroup_datapoints(date))
        if (get_data_dir() / 'vic' / 'csv_data' / date / 'all_lga.csv').exists():
            r.extend(self._get_all_lga_datapoints(date))
        if (get_data_dir() / 'vic' / 'csv_data' / date / 'all_lga_acquired_source').exists():
            r.extend(self._get_all_lga_acquired_source_datapoints(date))
        if (get_data_dir() / 'vic' / 'csv_data' / date / 'all_acquired_source').exists():
            r.extend(self._get_all_acquired_source_datapoints(date))
    return r

def get_datapoints(self):
    r = DataPointMerger()
    for path in SA_TABLEAU_MAP_DIR.iterdir():
        r.extend(self._get_datapoints(path))
    return r

def _get_municipality_data(self):
    out = DataPointMerger()
    base_dir = self.get_path_in_dir('')

    for date in self.iter_nonempty_dirs(base_dir):
        r = self.sdpf()
        path = f'{base_dir}/{date}/municipality_data.json'

        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.loads(f.read())
        except UnicodeDecodeError:
            # Some snapshots were saved Brotli-compressed
            import brotli
            with open(path, 'rb') as f:
                data = json.loads(brotli.decompress(f.read()).decode('utf-8'))

        if 'features' not in data and date <= '2020_12_21':
            continue

        for feature in data['features']:
            attributes = feature['attributes']

            if date >= '2020_12_22':
                #print(attributes)
                # The newer feed only provides incidence, so back-calculate cases:
                #   incidence = (cases / population) * 100000
                #   => cases = incidence / 100000 * population
                print(attributes['Incidência'] / 100000 * attributes['Total'],
                      attributes['Incidência'] / 100000,
                      attributes['Incidência'],
                      attributes['Total'])

                confirmed = round(
                    attributes['Incidência'] / 100000 * attributes['Total']
                ) if attributes['Incidência'] else 0

                if confirmed is not None:
                    r.append(
                        region_schema=Schemas.PT_MUNICIPALITY,
                        region_parent='PT',  # 'Distrito' -> district??
                        region_child=attributes['Concelho'],
                        datatype=DataTypes.TOTAL,
                        value=confirmed,
                        date_updated=date,
                        source_url=self.SOURCE_URL)
            else:
                if attributes['Data_Conc'] is None:
                    continue

                # Only confirmed and deaths are shown in the dashboard
                date = datetime.datetime.fromtimestamp(
                    attributes['Data_Conc'] / 1000.0).strftime('%Y_%m_%d')
                confirmed = attributes['ConfirmadosAcumulado_Conc']

                if confirmed is not None:
                    r.append(
                        region_schema=Schemas.PT_MUNICIPALITY,
                        region_parent='PT',  # 'Distrito' -> district??
                        region_child=attributes['Concelho'],
                        datatype=DataTypes.TOTAL,
                        value=int(confirmed),
                        date_updated=date,
                        source_url=self.SOURCE_URL)

        out.extend(r)
    return out

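# Illustrative sketch (not part of the original module): the post-2020_12_22
# Portuguese feed only exposes incidence per 100,000 ('Incidência') alongside a
# population figure ('Total'), so case counts are back-calculated as
#   cases = incidence / 100000 * population
# and rounded to the nearest integer, as in _get_municipality_data() above.
# The helper name and the example figures are hypothetical.
def cases_from_incidence(incidence_per_100k, population):
    if not incidence_per_100k:
        return 0
    return round(incidence_per_100k / 100000 * population)

# e.g. cases_from_incidence(250.0, 40000) == 100
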
def _get_regions_data(self):
    out = DataPointMerger()
    base_dir = self.get_path_in_dir('')

    for date in self.iter_nonempty_dirs(base_dir):
        r = self.sdpf()
        path = f'{base_dir}/{date}/regions_data.json'
        print(path)

        with open(path, 'r', encoding='utf-8') as f:
            data = f.read()

        if 'Error performing query operation' in data:
            continue

        # Strip the JSONP wrapper before parsing
        data = data.replace(
            'dojo_request_script_callbacks.dojo_request_script57(',
            '').rstrip().rstrip(');')
        data = json.loads(data)

        for feature in data['features']:
            attributes = feature['attributes']
            #print(attributes)

            # Only confirmed and deaths are shown in the dashboard
            date = datetime.datetime.fromtimestamp(
                attributes['ATNUJINTA'] / 1000).strftime('%Y_%m_%d')

            region_parent = 'LT'  #county_map[attributes['APSKRITIS']]  # NOTE: not sure it's worth splitting for now, but could map to admin_1
            region_child = attributes['SAV_PAV'].replace(' r.', '').replace(' m.', '')

            confirmed = attributes['ATVEJAI']
            deaths = attributes['MIRTYS_KITA']
            positive = attributes['VYRAI']    # NOTE: same field as 'men' below
            recovered = attributes['PASVEIKO']
            women = attributes['MOTERYS']
            men = attributes['VYRAI']
            treated = attributes['GYDOMA']    # currently unused
            unknown = attributes['MIRE']      # currently unused

            if confirmed is not None:
                r.append(region_schema=Schemas.LT_MUNICIPALITY,
                         region_parent=region_parent,
                         region_child=region_child,
                         datatype=DataTypes.TOTAL,
                         value=int(confirmed),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

            if positive is not None:
                r.append(region_schema=Schemas.LT_MUNICIPALITY,
                         region_parent=region_parent,
                         region_child=region_child,
                         datatype=DataTypes.CONFIRMED,
                         value=int(positive),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

            if women is not None:
                r.append(region_schema=Schemas.LT_MUNICIPALITY,
                         region_parent=region_parent,
                         region_child=region_child,
                         datatype=DataTypes.TOTAL_FEMALE,
                         value=int(women),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

            if men is not None:
                r.append(region_schema=Schemas.LT_MUNICIPALITY,
                         region_parent=region_parent,
                         region_child=region_child,
                         datatype=DataTypes.TOTAL_MALE,
                         value=int(men),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

            if deaths is not None:
                r.append(region_schema=Schemas.LT_MUNICIPALITY,
                         region_parent=region_parent,
                         region_child=region_child,
                         datatype=DataTypes.STATUS_DEATHS,
                         value=int(deaths),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

            if recovered is not None:
                r.append(region_schema=Schemas.LT_MUNICIPALITY,
                         region_parent=region_parent,
                         region_child=region_child,
                         datatype=DataTypes.STATUS_RECOVERED,
                         value=int(recovered),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

            if recovered is not None and confirmed is not None and deaths is not None:
                # active = total - recovered - deaths
                r.append(region_schema=Schemas.LT_MUNICIPALITY,
                         region_parent=region_parent,
                         region_child=region_child,
                         datatype=DataTypes.STATUS_ACTIVE,
                         value=int(confirmed) - int(recovered) - int(deaths),
                         date_updated=date,
                         source_url=self.SOURCE_URL)

        out.extend(r)
    return out