def _get_from_path(self, path_data_page, date):
    r = []

    with open(path_data_page, 'r', encoding='utf-8') as f:
        for item in csv.DictReader(f):
            r.append(
                DataPoint(
                    region_schema=Schemas.POSTCODE,
                    region_parent='AU-VIC',
                    region_child=item['Postcode'],
                    datatype=DataTypes.TOTAL,
                    value=int(item['Confirmed cases (ever)'] or 0),
                    date_updated=date,  # FIXME!!!!!
                    source_url=self.SOURCE_URL,
                    source_id=self.SOURCE_ID))
            r.append(
                DataPoint(
                    region_schema=Schemas.POSTCODE,
                    region_parent='AU-VIC',
                    region_child=item['Postcode'],
                    datatype=DataTypes.STATUS_ACTIVE,
                    value=int(item['Active cases (current)'] or 0),
                    date_updated=date,  # FIXME!!!!!
                    source_url=self.SOURCE_URL,
                    source_id=self.SOURCE_ID))

    return r

def _get_timeline(self):
    # {"UpdateDate":"14\/05\/2020 11:35",
    #  "Source":"https:\/\/covid19.th-stat.com\/",
    #  "DevBy":"https:\/\/www.kidkarnmai.com\/",
    #  "SeverBy":"https:\/\/smilehost.asia\/",
    #  "Data":[{
    #      "Date":"01\/01\/2020",
    #      "NewConfirmed":0,
    #      "NewRecovered":0,
    #      "NewHospitalized":0,
    #      "NewDeaths":0,
    #      "Confirmed":0,
    #      "Recovered":0,
    #      "Hospitalized":0,
    #      "Deaths":0
    #  }, ...
    r = []
    text = self.get_text('timeline.json', include_revision=True)
    data = json.loads(text)

    for item in data['Data']:
        if not item['Date']:
            continue
        date = self.convert_date(item['Date'], formats=('%m/%d/%Y',))

        r.append(
            DataPoint(region_schema=Schemas.ADMIN_0,
                      region_parent='',
                      region_child='TH',
                      datatype=DataTypes.TOTAL,
                      value=int(item['Confirmed']),
                      date_updated=date,
                      source_url=self.SOURCE_URL))
        r.append(
            DataPoint(region_schema=Schemas.ADMIN_0,
                      region_parent='',
                      region_child='TH',
                      datatype=DataTypes.STATUS_RECOVERED,
                      value=int(item['Recovered']),
                      date_updated=date,
                      source_url=self.SOURCE_URL))
        r.append(
            DataPoint(region_schema=Schemas.ADMIN_0,
                      region_parent='',
                      region_child='TH',
                      datatype=DataTypes.STATUS_HOSPITALIZED,
                      value=int(item['Hospitalized']),
                      date_updated=date,
                      source_url=self.SOURCE_URL))
        r.append(
            DataPoint(region_schema=Schemas.ADMIN_0,
                      region_parent='',
                      region_child='TH',
                      datatype=DataTypes.STATUS_DEATHS,
                      value=int(item['Deaths']),
                      date_updated=date,
                      source_url=self.SOURCE_URL))

    return r

def _get_total_cases_by_region(self, url, html):
    if url == self.STATS_BY_REGION_URL:
        tables = self._pq_contains(
            html, 'table', 'LGA Region', ignore_case=True) or []
        du = self._get_date(url, html)
        r = []

        for table in tables:
            for lga, num_cases in table[1]:
                lga = pq(lga).text().strip()

                if lga.lower() == 'total':
                    # This value is very often out of date!!!
                    if False:
                        r.append(
                            DataPoint(region_schema=Schemas.THS,
                                      region_parent='au-tas',
                                      region_child=pq(table[0][0][0]).text()
                                          .strip().split(' - ')[-1].strip(),
                                      datatype=DataTypes.TOTAL,
                                      value=int(
                                          pq(num_cases).text().replace(
                                              ',', '').strip()),
                                      date_updated=du,
                                      source_url=url))
                else:
                    r.append(
                        DataPoint(region_schema=Schemas.LGA,
                                  region_parent='au-tas',
                                  region_child=lga,
                                  datatype=DataTypes.TOTAL,
                                  value=int(
                                      pq(num_cases).text().replace(
                                          ',', '').strip()),
                                  date_updated=du,
                                  source_url=url))
        return r
    else:
        table = self._pq_contains(html, 'table', 'Local Government Area',
                                  ignore_case=True)
        du = self._get_date(url, html)
        r = []

        if table:
            for region_child, lga, num_cases in table[0][1]:
                r.append(
                    DataPoint(region_schema=Schemas.LGA,
                              region_parent='au-tas',
                              region_child=pq(lga).text().strip(),
                              datatype=DataTypes.TOTAL,
                              value=int(
                                  pq(num_cases).text().replace(
                                      ',', '').strip()),
                              date_updated=du,
                              source_url=url))
        return r

def _get_datapoints(self, path):
    date = path.name.split('-')[0]
    print(date)

    for path in path.iterdir():
        print(path)

        with open(path, 'r', encoding='utf-8') as f:
            r = []
            text = json.loads(f.read())
            data = self.get_from_multipart(text, 'dataColumns')

            values_by_idx = self.get_recursively(data, 'dataColumns')[0]['dataValues']
            lga_by_idx = self.get_recursively(data, 'dataColumns')[2]['dataValues']
            active_idx = self.get_recursively(data, 'paneColumnsList')[0]['vizPaneColumns'][3]['aliasIndices']
            total_idx = self.get_recursively(data, 'paneColumnsList')[0]['vizPaneColumns'][4]['aliasIndices']

            for _active, lga in zip(active_idx, lga_by_idx[1:]):
                if _active in (-44, -70, -71):
                    continue
                elif _active < 0:
                    continue
                    # Unreachable: kept disabled so other unexpected negative
                    # indices are skipped rather than aborting the run
                    raise Exception(_active)

                r.append(DataPoint(
                    region_schema=Schemas.LGA,
                    region_parent='AU-SA',
                    region_child=normalize_locality_name(lga),
                    datatype=DataTypes.STATUS_ACTIVE,
                    value=int(values_by_idx[_active]),
                    date_updated=date,
                    source_url=self.SOURCE_URL,
                    source_id=self.SOURCE_ID
                ))

            for _total, lga in zip(total_idx, lga_by_idx[1:]):
                if _total == -44:
                    continue
                elif _total < 0:
                    raise Exception(_total)

                r.append(DataPoint(
                    region_schema=Schemas.LGA,
                    region_parent='AU-SA',
                    region_child=normalize_locality_name(lga),
                    datatype=DataTypes.TOTAL,
                    value=int(values_by_idx[_total]),
                    date_updated=date,
                    source_url=self.SOURCE_URL,
                    source_id=self.SOURCE_ID
                ))

            #print(values_by_idx)
            #print(lga_by_idx)
            #pprint(total_idx)
            return r

def _get_source_of_infection(self, updated_date, response_dict):
    # * Overseas acquired
    # * Cruise ship acquired (included in overseas acquired)
    # * Interstate acquired
    # * Locally acquired - contact of a confirmed case
    # * Locally acquired - contact not identified
    # * Under investigation

    # Normalise it with other states
    vic_norm_map = {
        'Travel overseas': DataTypes.SOURCE_OVERSEAS,
        'Contact with a confirmed case': DataTypes.SOURCE_CONFIRMED,
        'Acquired in Australia, unknown source': DataTypes.SOURCE_COMMUNITY,
        'Under investigation': DataTypes.SOURCE_UNDER_INVESTIGATION
    }
    output = []
    data = response_dict['source_of_infection'][1]
    added = set()

    for source in data['result']['data']['dsr']['DS'][0]['PH'][0]['DM0']:
        output.append(
            DataPoint(region_schema=Schemas.ADMIN_1,
                      region_parent='AU',
                      region_child='AU-VIC',
                      datatype=vic_norm_map[source['C'][0]],
                      value=source['C'][1],
                      date_updated=updated_date,
                      source_url=self.SOURCE_URL,
                      source_id=self.SOURCE_ID))
        added.add(vic_norm_map[source['C'][0]])

    for datatype in vic_norm_map.values():
        if datatype in added:
            continue
        # Sometimes "under investigation" isn't provided,
        # but probably can assume at 0 for these days
        output.append(
            DataPoint(region_schema=Schemas.ADMIN_1,
                      region_parent='AU',
                      region_child='AU-VIC',
                      datatype=datatype,
                      value=0,
                      date_updated=updated_date,
                      source_url=self.SOURCE_URL,
                      source_id=self.SOURCE_ID))

    return output

def get_statistic(self):
    r = []
    data = json.loads(self.get_text('statistic.json', include_revision=True))
    by_region = Counter()

    for region_data in data:
        # {"dataSet":{"id":1,"code":"COVID19","name":"COVID19 statistics",
        #  "shortName":"COVID19 stat","sourceUrl":"file:///tmp/covid19stat.txt",
        #  "resourceUrl":null,"dataSetGroup":{"id":1,"name":"Default","pos":1,
        #  "icon_resource":null},"pos":1,"isComparable":null,"delimiter":";"},
        #print(region_data)

        for point_dict in region_data['points']:
            # {"abscissa":{"id":45862,"year":2020,"month":3,"day":10,
            #  "name":"2020-03-10","date":"2020-03-10"},"ordinate":0.0}
            #print(point_dict['abscissa']['date'])
            if point_dict['ordinate'] is None:
                continue

            region_child = region_map[region_data['name'].lower().strip()]
            value = int(point_dict['ordinate'])
            date = self.convert_date(point_dict['abscissa']['date'])
            by_region[date, region_child] += value

            r.append(DataPoint(
                region_schema=Schemas.ADMIN_1,
                region_parent='RS',
                region_child=region_child,
                datatype=DataTypes.NEW,
                value=value,
                source_url=self.SOURCE_URL,
                date_updated=date
            ))

    cumulative = Counter()
    for (date, region_child), value in sorted(by_region.items()):
        cumulative[region_child] += value
        r.append(DataPoint(
            region_schema=Schemas.ADMIN_1,
            region_parent='RS',
            region_child=region_child,
            datatype=DataTypes.TOTAL,
            value=cumulative[region_child],
            source_url=self.SOURCE_URL,
            date_updated=date
        ))

    return r

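# A minimal standalone sketch (illustrative only, not part of the scraper) of
# the cumulative-total logic in get_statistic above: per-day NEW values are
# bucketed by (date, region), then summed in date order so each region carries
# a running TOTAL. Names here are hypothetical.
#
#     from collections import Counter
#
#     def cumulative_totals(new_values):
#         # new_values: iterable of (iso_date, region, value), in any order
#         by_region = Counter()
#         for date, region, value in new_values:
#             by_region[date, region] += value
#
#         cumulative = Counter()
#         totals = []
#         # ISO date strings sort chronologically, so plain sorted() suffices
#         for (date, region), value in sorted(by_region.items()):
#             cumulative[region] += value
#             totals.append((date, region, cumulative[region]))
#         return totals
#
#     cumulative_totals([('2020-03-10', 'RS-00', 2), ('2020-03-11', 'RS-00', 3)])
#     # -> [('2020-03-10', 'RS-00', 2), ('2020-03-11', 'RS-00', 5)]
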
def _get_total_male_female_breakdown(self, url, html):
    du = self._get_date(url, html)
    regex = compile(
        r'Total cases include ([0-9,]+) men and ([0-9,]+) women')
    match = regex.search(html)

    if match:
        men = int(match.group(1).replace(',', ''))
        women = int(match.group(2).replace(',', ''))

        men = DataPoint(region_schema=Schemas.ADMIN_1,
                        region_parent='au',
                        region_child='au-vic',
                        date_updated=du,
                        datatype=DataTypes.TOTAL_MALE,
                        value=men,
                        source_url=url)
        women = DataPoint(region_schema=Schemas.ADMIN_1,
                          region_parent='au',
                          region_child='au-vic',
                          date_updated=du,
                          datatype=DataTypes.TOTAL_FEMALE,
                          value=women,
                          source_url=url)
        return men, women
    else:
        men = self._extract_number_using_regex(
            compile('total[^0-9.]+?([0-9,]+) men'),
            html,
            region_schema=Schemas.ADMIN_1,
            region_parent='AU',
            region_child='AU-VIC',
            source_url=url,
            datatype=DataTypes.TOTAL_MALE,
            date_updated=du)
        women = self._extract_number_using_regex(
            compile('total[^0-9.]+?([0-9,]+) women'),
            html,
            region_schema=Schemas.ADMIN_1,
            region_parent='AU',
            region_child='AU-VIC',
            source_url=url,
            datatype=DataTypes.TOTAL_FEMALE,
            date_updated=du)

        if men is not None and women is not None:
            return men, women
        return None

def _get_total_cases_tested(self, url, html):
    neg_cases = self._extract_number_using_regex(
        # Seems the WA website's wording can change day-to-day
        compile(r'([0-9]+[0-9,]*?)'
                r'([^0-9]*?negative COVID-19 tests|'
                r'[^0-9]*?tested negative|'
                r'[^0-9]*?negative)'),
        html,
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-WA',
        source_url=url,
        datatype=DataTypes.TESTS_TOTAL,
        date_updated=self._get_date(url, html))
    pos_cases = self._get_total_cases(url, html)

    if neg_cases and pos_cases:
        return DataPoint(region_schema=Schemas.ADMIN_1,
                         region_parent='AU',
                         region_child='AU-WA',
                         datatype=neg_cases.datatype,
                         value=neg_cases.value + pos_cases.value,
                         date_updated=neg_cases.date_updated,
                         source_url=neg_cases.source_url,
                         text_match=(neg_cases.text_match,
                                     pos_cases.text_match))
    return None

def _get_total_new_cases(self, href, html):
    c_html = word_to_number(html)

    if 'same total number as yesterday' in html:
        # https://www.dhhs.vic.gov.au/coronavirus-update-victoria-27-april-2020
        return DataPoint(region_schema=Schemas.ADMIN_1,
                         region_parent='AU',
                         region_child='AU-VIC',
                         datatype=DataTypes.NEW,
                         value=0,
                         date_updated=self._get_date(href, html),
                         source_url=href,
                         text_match='same total number as yesterday')

    return self._extract_number_using_regex(
        compile('increase of ([0-9,]+)'),
        c_html,
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-VIC',
        datatype=DataTypes.NEW,
        source_url=href,
        date_updated=self._get_date(href, html)
    ) or self._extract_number_using_regex(
        compile('([0-9,]+) new cases'),
        c_html,
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-VIC',
        datatype=DataTypes.NEW,
        source_url=href,
        date_updated=self._get_date(href, html))

def _get_recovered_sum(self):
    r = []
    base_dir = self.get_path_in_dir('')

    for date in self.iter_nonempty_dirs(base_dir):
        path = f'{base_dir}/{date}/is_index.html'
        with open(path, 'rb') as f:
            data = f.read()
            data = data.decode('utf-8')

        # TODO: There are quite a few more stats!!
        regional_stats = data.split(
            '[[[null,{"font-weight":"700","value":"Infections"},'
            '{"font-weight":"700","value":"Quarantine"}],')[1].split(']]],')[0]
        #print(regional_stats)
        regional_stats = json.loads(f'[{regional_stats}]]')

        for region, infections_dict, quarantine_dict in regional_stats:
            region = place_map[region]
            r.append(DataPoint(
                region_schema=Schemas.ADMIN_1,
                region_parent='IS',
                region_child=region,
                datatype=DataTypes.TOTAL,
                # This changed to be an int from a dict on 9 Jun
                value=int(infections_dict['value'])
                    if isinstance(infections_dict, dict)
                    else int(infections_dict),
                date_updated=date,
                source_url=self.SOURCE_URL
            ))

    return r

def _get_total_cases(self, href, html):
    du = self._get_date(href, html)

    if href in (self.STATS_BY_REGION_URL, self.STATS_BY_REGION_URL_2):
        # New format as of 22 April
        cases = pq(html)('.qh-fact-wrapper .cases span')
        if cases:
            return self._extract_number_using_regex(
                compile('([0-9,]+)'),
                pq(cases[0]).text().strip(),
                region_schema=Schemas.ADMIN_1,
                region_parent='AU',
                region_child='AU-QLD',
                datatype=DataTypes.TOTAL,
                date_updated=du,
                source_url=href)
        else:
            return None

    # Use new format from the table if possible
    totals_dict = self.__get_totals_from_table(html)
    if totals_dict:
        return DataPoint(region_schema=Schemas.ADMIN_1,
                         region_parent='AU',
                         region_child='AU-QLD',
                         datatype=DataTypes.TOTAL,
                         value=totals_dict['total'],
                         date_updated=du,
                         source_url=href,
                         text_match=None)

    c_html = word_to_number(html)
    return self._extract_number_using_regex(
        (compile('state total to ([0-9,]+)'),
         compile('total of ([0-9,]+) (?:people|person)')),
        c_html,
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-QLD',
        source_url=href,
        datatype=DataTypes.TOTAL,
        date_updated=du
    ) or self._extract_number_using_regex(
        compile(
            # Total number changed from being enclosed in a <strong>
            # tag to a <b> tag, so changed to be as broad as NSW
            # <strong>Total</strong></td>
            # <td headers="table59454r1c2"><b>37,334</b></td>
            r'<td[^>]*?>(?:<[^</>]+>)?Total(?:</[^<>]+>)?</td>'
            r'[^<]*?<td[^>]*?>.*?([0-9,]+).*?</td>',
            MULTILINE | DOTALL),
        c_html,
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-QLD',
        source_url=href,
        datatype=DataTypes.TOTAL,
        date_updated=du)

def _get_all_lga_datapoints(self, date):
    r = []
    current_date = None
    by_lga = ExpiringCounter()

    with open(get_data_dir() / 'vic' / 'csv_data' / date / 'all_lga.csv',
              'r', encoding='utf-8') as f:
        # The appended sentinel row forces a final flush of the last date bucket
        for row in sorted(csv.DictReader(f), key=lambda x: x['diagnosis_date']) + \
                [{'diagnosis_date': '1111-01-01', 'Localgovernmentarea': None}]:
            date_updated = self.convert_date(row['diagnosis_date'])

            if current_date != date_updated:
                if current_date is not None:
                    for lga, value in by_lga.items():
                        r.append(
                            DataPoint(region_schema=Schemas.LGA,
                                      region_parent='AU-VIC',
                                      region_child=normalize_locality_name(
                                          lga.split('(')[0].strip()),
                                      datatype=DataTypes.TOTAL,
                                      value=int(value),
                                      date_updated=current_date,
                                      source_url=self.SOURCE_URL,
                                      source_id=self.SOURCE_ID))
                current_date = date_updated

            if row['Localgovernmentarea']:
                by_lga[row['Localgovernmentarea'].strip('_')] += 1

    return r

def _get_agegroup_datapoints(self, date):
    r = []
    current_date = None
    by_agegroup = Counter()

    with open(get_data_dir() / 'vic' / 'csv_data' / date / 'agegroup.csv',
              'r', encoding='utf-8') as f:
        # The appended sentinel row forces a final flush of the last date bucket
        for row in sorted(csv.DictReader(f), key=lambda x: x['diagnosis_date']) + \
                [{'diagnosis_date': '1111-01-01', 'agegroup': None}]:
            assert len(row['diagnosis_date']) in (9, 10), row['diagnosis_date']
            date_updated = self.convert_date(row['diagnosis_date'])

            if current_date != date_updated:
                if current_date is not None:
                    for agerange, value in by_agegroup.items():
                        r.append(
                            DataPoint(region_schema=Schemas.ADMIN_1,
                                      region_parent='AU',
                                      region_child='AU-VIC',
                                      datatype=DataTypes.TOTAL,
                                      agerange=agerange,
                                      value=int(value),
                                      date_updated=current_date,
                                      source_url=self.SOURCE_URL,
                                      source_id=self.SOURCE_ID))
                current_date = date_updated

            if row['agegroup']:
                by_agegroup[row['agegroup'].strip('_')] += 1

    return r

def __get_tests_datapoints(self, SOURCE_URL, path_tests):
    r = DataPointMerger()

    with open(path_tests, 'r', encoding='utf-8') as f:
        for item in json.loads(f.read())['data']:
            try:
                item['POA_NAME16'] = str(int(float(item['POA_NAME16'])))
            except ValueError:
                pass

            date = self.__get_partial_date(path_tests, item['Date'])
            number = int(item['Number'])
            # recent = item['Recent']  # TODO: ADD ME!!!
            postcode = item['POA_NAME16'] if item['POA_NAME16'] else 'Unknown'

            r.append(DataPoint(
                region_schema=Schemas.POSTCODE,
                region_parent='AU-NSW',
                region_child=postcode,
                datatype=DataTypes.TESTS_TOTAL,
                value=number,
                date_updated=date,
                source_url=SOURCE_URL,
                source_id=self.SOURCE_ID
            ))

    return r

def _get_total_source_of_infection(self, url, html):
    norm_map = {
        'Locally Acquired—close contact with confirmed case':
            DataTypes.SOURCE_CONFIRMED,
        'Locally Acquired—no known contact': DataTypes.SOURCE_COMMUNITY,
        'Interstate acquired': DataTypes.SOURCE_INTERSTATE,
        'Overseas acquired': DataTypes.SOURCE_OVERSEAS,
        'Under investigation': DataTypes.SOURCE_UNDER_INVESTIGATION,
    }
    du = self._get_date(url, html)

    if url == self.STATS_BY_REGION_URL_2:
        table = pq(html)('#QLD_Cases_Sources_Of_Infection')[0]
        #print(pq(table).html())
        r = []

        for header, value in table[0]:
            header = pq(header).text().strip()
            if header in ('Confirmed cases', 'Total cases'):
                continue
            value = pq(value).text().strip()

            r.append(
                DataPoint(region_schema=Schemas.ADMIN_1,
                          region_parent='AU',
                          region_child='AU-QLD',
                          datatype=norm_map[header],
                          value=int(value.replace(',', '')),
                          date_updated=du,
                          source_url=url))
        return r
    else:
        return []

def _get_lga_datapoints(self, date):
    # LGA             lga_pid  population  active  cases  rate  new  band  LGADisplay  data_date
    # Alpine (S)      VIC242   12814       0       1      0     0    0     Alpine      29/08/2020
    # Ararat (RC)     VIC220   11845       1       7      8.4   0    1     Ararat      29/08/2020
    # Ballarat (C)    VIC241   109505      6       61     5.5   0    1     Ballarat    29/08/2020
    # Banyule (C)     VIC188   131631      30      437    22.8  0    2     Banyule     29/08/2020
    # Bass Coast (S)  VIC173   36320       0       11     0     0    0     Bass Coast  29/08/2020
    # Baw Baw (S)     VIC194   53396       1       15     1.9   0    1     Baw Baw     29/08/2020
    # Bayside (C)     VIC182   106862      72      227    67.4  6    3     Bayside     29/08/2020
    # Benalla (RC)    VIC199   14037       0       3      0     0    0     Benalla     29/08/2020
    r = []
    print("LGA:", get_data_dir() / 'vic' / 'csv_data' / date)

    with open(get_data_dir() / 'vic' / 'csv_data' / date / 'lga.json',
              'r', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            #print(row)
            date_updated = self.convert_date(row['data_date'])

            for datatype, value in ((DataTypes.STATUS_ACTIVE, row['active']),
                                    (DataTypes.TOTAL, row['cases'])):
                r.append(
                    DataPoint(region_schema=Schemas.LGA,
                              region_parent='AU-VIC',
                              region_child=normalize_locality_name(
                                  row['LGA'].split('(')[0].strip()),
                              datatype=datatype,
                              value=int(value),
                              date_updated=date_updated,
                              source_url=self.SOURCE_URL,
                              source_id=self.SOURCE_ID))

    return r

def __get_merged_datapoint(self, datapoint):
    """
    Find+remove the previous datapoint (if it exists);
    return a new datapoint with both values added
    """
    unique_key = self.__get_unique_key(datapoint)

    if unique_key in self.__datapoint_indexes:
        replace_index = self.__datapoint_indexes[unique_key]
        i = self[replace_index]
        r = DataPoint(region_schema=datapoint.region_schema,
                      region_parent=datapoint.region_parent,
                      region_child=datapoint.region_child,
                      date_updated=datapoint.date_updated,
                      datatype=datapoint.datatype,
                      agerange=datapoint.agerange,
                      value=datapoint.value + i.value,
                      source_url=datapoint.source_url or i.source_url,
                      text_match=datapoint.text_match or i.text_match,
                      source_id=datapoint.source_id)
    else:
        replace_index = None
        r = datapoint

    return replace_index, r

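# Usage sketch for the merge step above (hypothetical driver code, inferred
# from the docstring rather than shown in the source): the caller looks up
# replace_index, drops the old datapoint at that index, and appends the
# returned summed datapoint, so two datapoints sharing a unique key
# (region/date/datatype/agerange) collapse into one whose value is the sum:
#
#     replace_index, merged = self.__get_merged_datapoint(datapoint)
#     if replace_index is not None:
#         del self[replace_index]
#     self.append(merged)
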
def _get_regions(self, updated_date, response_dict):
    output = []
    data = response_dict['regions'][1]
    previous_value = None

    for region_child in data['result']['data']['dsr']['DS'][0]['PH'][0]['DM0']:
        value, previous_value = self.process_powerbi_value(
            region_child, previous_value, data)
        if value[0] is None:
            continue
        region_string = value[0].split('(')[0].strip()

        output.append(
            DataPoint(region_schema=Schemas.LGA,
                      region_parent='au-vic',
                      region_child=region_string,
                      datatype=DataTypes.TOTAL,
                      value=value[1],
                      date_updated=updated_date,
                      source_url=self.SOURCE_URL,
                      source_id=self.SOURCE_ID))
        previous_value = value
        # print(output[-1])
        self.totals_dict[region_string] = value[1]

    return output

def _get_total_age_breakdown(self, href, html):
    if href == self.STATS_BY_REGION_URL_2:
        r = []
        table = pq(html)('#QLD_CasesByAgeAndGender')[0][1]
        du = self._get_date(href, html)

        for tr in table[1:]:
            age_group = pq(tr[0]).text().strip()
            female = int(pq(tr[1]).text().replace(',', ''))
            male = int(pq(tr[2]).text().replace(',', ''))
            total = int(pq(tr[3]).text().replace(',', ''))

            for datatype, value in ((DataTypes.TOTAL_FEMALE, female),
                                    (DataTypes.TOTAL_MALE, male),
                                    (DataTypes.TOTAL, total)):
                if value is None:
                    continue
                r.append(
                    DataPoint(region_schema=Schemas.ADMIN_1,
                              region_parent='AU',
                              region_child='AU-QLD',
                              datatype=datatype,
                              agerange=age_group,
                              value=value,
                              date_updated=du,
                              source_url=href))
        return r

def _get_total_age_breakdown(self, href, html):
    table = self._pq_contains(html, 'table', 'By age group',
                              ignore_case=True)
    if not table:
        return  # WARNING!!!

    du = self._get_date(href, html)
    table = table[0]
    tbody = pq(table)('tbody')[0]
    tr = tbody[1]

    ages = [
        int(i.replace(',', '').strip())
        for i in pq(tr).text().split('\n')
    ]
    ages = {
        '0-29': ages[0],
        '30-39': ages[1],
        '40-49': ages[2],
        '50-59': ages[3],
        '60-69': ages[4],
        '70+': ages[5]
    }

    r = []
    for k, v in ages.items():
        r.append(
            DataPoint(region_schema=Schemas.ADMIN_1,
                      region_parent='AU',
                      region_child='AU-ACT',
                      datatype=DataTypes.TOTAL,
                      agerange=k,
                      value=v,
                      date_updated=du,
                      source_url=href))
    return r

def _get_postcode_datapoints(self, date):
    # postcode  population  active  cases  rate   new  band  data_date
    # 3000      37979       18      119    47.4   0    2     29/08/2020
    # 3001      0           0       1      0      0    0     29/08/2020
    # 3002      4957        2       14     40.3   0    2     29/08/2020
    # 3003      5516        3       36     54.4   0    3     29/08/2020
    # 3004      9311        6       63     64.4   2    3     29/08/2020
    # 3005      523         0       0      0      0    0     29/08/2020
    # 3006      18811       1       64     5.3    0    1     29/08/2020
    # 3008      10438       2       49     19.2   0    1     29/08/2020
    # 3010      1595        0       0      0      0    0     29/08/2020
    # 3011      21464       36      164    167.7  2    4     29/08/2020
    r = []
    print("PostCode:", get_data_dir() / 'vic' / 'csv_data' / date)

    with open(get_data_dir() / 'vic' / 'csv_data' / date / 'postcode.json',
              'r', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            date_updated = self.convert_date(row['data_date'])

            for datatype, value in ((DataTypes.STATUS_ACTIVE, row['active']),
                                    (DataTypes.TOTAL, row['cases'])):
                r.append(
                    DataPoint(region_schema=Schemas.POSTCODE,
                              region_parent='AU-VIC',
                              region_child=row['postcode'],
                              datatype=datatype,
                              value=int(value),
                              date_updated=date_updated,
                              source_url=self.SOURCE_URL,
                              source_id=self.SOURCE_ID))

    return r

def _get_gender_balance_data(self, updated_date, response_dict):
    r = []
    try:
        data = response_dict['gender_balance'][1]
    except KeyError:
        return []  # WARNING!!!

    # WARNING: This sometimes has another query before it!!!
    # NOTE: both branches below are currently identical; the fallback
    # presumably should skip the extra leading query when it appears
    try:
        m_f = data['result']['data']['dsr']['DS'][0]['PH'][0]['DM0']
        assert m_f[0]['C'][0] in ('Males', 'Male')
        assert m_f[1]['C'][0] in ('Females', 'Female')
    except:
        m_f = data['result']['data']['dsr']['DS'][0]['PH'][0]['DM0']
        assert m_f[0]['C'][0] in ('Males', 'Male')
        assert m_f[1]['C'][0] in ('Females', 'Female')

    male = m_f[0]['C'][1]
    try:
        female = m_f[1]['C'][1]
    except IndexError:
        assert m_f[1]['R']
        female = male

    r.append(DataPoint(
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-ACT',
        datatype=DataTypes.TOTAL_MALE,
        value=self._to_int(male),
        date_updated=updated_date,
        source_url=self.source_url,
        source_id=self.SOURCE_ID
    ))
    r.append(DataPoint(
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-ACT',
        datatype=DataTypes.TOTAL_FEMALE,
        value=self._to_int(female),
        date_updated=updated_date,
        source_url=self.source_url,
        source_id=self.SOURCE_ID
    ))
    return r

def __postcode_datapoints_to_lga(self, SOURCE_URL, postcode_to_lga, r,
                                 source_id):
    # Convert postcode to LGA where possible
    new_r = DataPointMerger()
    added_to_lga = set()
    processed_postcode = set()
    mapping = Counter()

    for datapoint in sorted(r, key=lambda i: i.date_updated):
        if datapoint.region_schema == Schemas.LGA:
            added_to_lga.add((
                datapoint.region_child,
                datapoint.datatype
            ))
            continue
        elif datapoint.region_schema != Schemas.POSTCODE:
            continue
        elif datapoint.region_child in postcode_to_lga:
            lga = postcode_to_lga[datapoint.region_child]
        else:
            lga = 'unknown'
            if datapoint.region_child != 'unknown':
                print("NOT FOUND:", datapoint.region_child)
            # continue  # WARNING!!!

        if (datapoint.region_child,
                datapoint.datatype,
                datapoint.date_updated) in processed_postcode:
            #print("IGNORING DOUBLE-UP:", datapoint)
            continue
        processed_postcode.add((datapoint.region_child,
                                datapoint.datatype,
                                datapoint.date_updated))

        #if lga == 'cumberland':
        #    print('USING:', datapoint)

        mapping[
            lga,
            datapoint.datatype,
            datapoint.date_updated
        ] += datapoint.value

    new_r.extend(r)

    for (lga, datatype, date_updated), value in mapping.items():
        if (lga, datatype) in added_to_lga:
            # Don't add to LGA if available using direct data!
            continue

        new_r.append(DataPoint(
            region_schema=Schemas.LGA,
            region_parent='AU-NSW',
            region_child=lga,
            datatype=datatype,
            value=value,
            date_updated=date_updated,
            source_url=SOURCE_URL,
            source_id=source_id
        ))

    return new_r

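# A condensed standalone sketch (illustrative, with hypothetical names) of the
# postcode-to-LGA roll-up above: postcode values are summed into their mapped
# LGA per (datatype, date), and any (lga, datatype) pair already covered by
# direct LGA data is dropped so aggregated values never shadow first-party ones.
#
#     from collections import Counter
#
#     def roll_up_postcodes(postcode_values, postcode_to_lga, direct_lga_keys):
#         # postcode_values: iterable of (postcode, datatype, date, value)
#         # direct_lga_keys: set of (lga, datatype) pairs with first-party data
#         mapping = Counter()
#         for postcode, datatype, date, value in postcode_values:
#             lga = postcode_to_lga.get(postcode, 'unknown')
#             mapping[lga, datatype, date] += value
#         return {k: v for k, v in mapping.items()
#                 if (k[0], k[1]) not in direct_lga_keys}
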
def _get_total_age_breakdown(self, href, html):
    # TODO: TRANSITION TO https://data.nsw.gov.au/nsw-covid-19-data !!
    if '20200316_02.aspx' in href:
        # HACK: The very first entry was in a different format with
        # percentages. Maybe I could fix this later, but not sure it's worth it
        return None

    r = []
    table = self._pq_contains(
        html, 'table', 'Age Group',
        ignore_case=True
    )
    if not table:
        return None
    table = table[0]
    du = self._get_date(href, html)

    for age_group in (
        '0-9', '10-19', '20-29', '30-39', '40-49',
        '50-59', '60-69', '70-79', '80-89', '90-100'
    ):
        tds = self._pq_contains(table, 'tr', age_group)
        if not tds:
            continue
        tds = tds[0]

        female = int(pq(tds[1]).text().strip() or 0)
        male = int(pq(tds[2]).text().strip() or 0)
        total = int(pq(tds[3]).text().replace(' ', '').strip() or 0)

        for datatype, value in (
            (DataTypes.TOTAL_FEMALE, female),
            (DataTypes.TOTAL_MALE, male),
            (DataTypes.TOTAL, total)
        ):
            if value is None:
                continue
            r.append(DataPoint(
                region_schema=Schemas.ADMIN_1,
                region_parent='AU',
                region_child='AU-NSW',
                datatype=datatype,
                agerange=age_group,
                value=value,
                date_updated=du,
                source_url=href
            ))

    return r

def _extract_number_using_regex(self, regex, s,
                                source_url, datatype, date_updated,
                                agerange=None,
                                region_parent=None,
                                region_child=None,
                                region_schema=Schemas.ADMIN_1):
    """
    Convenience function for removing numeral grouping X,XXX
    and returning a number based on a match from re.compile()
    instance `regex`

    Multiple regexes can be specified for `regex`, in which
    case the first match will be returned
    """
    # assert region_parent
    assert region_child

    if isinstance(regex, (list, tuple)):
        for i_regex in regex:
            dp = self._extract_number_using_regex(
                i_regex, s, source_url, datatype, date_updated,
                agerange, region_parent, region_child, region_schema
            )
            if dp:
                return dp
        return None

    match = regex.search(s)
    # print(regex, match)

    if match:
        num = match.group(1)
        num = num.replace(',', '')

        if num.isdecimal():
            #print(f"    Found Match: {match.group()}")
            num = int(num)

            if date_updated is None:
                date_updated = self._todays_date()

            return DataPoint(
                region_schema=region_schema,
                region_parent=region_parent,
                region_child=region_child,
                datatype=datatype,
                agerange=agerange,
                value=num,
                date_updated=date_updated,
                source_url=source_url,
                text_match=s[
                    max(0, match.start(1) - 40):
                    min(len(s), match.end(1) + 40)
                ]
            )
    return None

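# Example of the multi-regex fallback described in the docstring above (the
# patterns here are placeholders, not ones tied to any particular scraper).
# The first regex that matches wins, so more specific wordings should come
# first; passing date_updated=None falls back to self._todays_date():
#
#     dp = self._extract_number_using_regex(
#         (compile(r'state total to ([0-9,]+)'),
#          compile(r'total of ([0-9,]+) cases')),
#         html,
#         source_url=url,
#         datatype=DataTypes.TOTAL,
#         date_updated=None,
#         region_parent='AU',
#         region_child='AU-QLD',
#     )
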
def _get_total_source_of_infection(self, url, html):
    # NOTE: there are also stats at
    # https://www.covid19.act.gov.au/updates/confirmed-case-information
    # but they're in a different format -
    # not sure it's worth supporting them

    # Normalise it with other states
    act_norm_map = {
        'Overseas acquired': DataTypes.SOURCE_OVERSEAS,
        'Cruise ship acquired': DataTypes.SOURCE_CRUISE_SHIP,
        'Interstate acquired': DataTypes.SOURCE_INTERSTATE,
        'Contact of a confirmed ACT case': DataTypes.SOURCE_CONFIRMED,
        'Unknown or local transmission': DataTypes.SOURCE_COMMUNITY,
        'Under investigation': DataTypes.SOURCE_UNDER_INVESTIGATION,
    }
    du = self._get_date(url, html)
    r = []

    for re_text in (
        r'<tr[^>]*><td[^>]*><p[^>]*>Overseas acquired</p></td>'
        r'<td[^>]*><p[^>]*>(?P<Overseas_acquired>[0-9,]+)</p></td></tr>',

        # Cruise ship-acquired was only added around 6 April
        r'<tr[^>]*><td[^>]*><p[^>]*>Cruise ship acquired</p></td>'
        r'<td[^>]*><p[^>]*>(?P<Cruise_ship_acquired>[0-9,]+) of the [0-9,]+</p></td></tr>',

        r'<tr[^>]*><td[^>]*><p[^>]*>Interstate acquired</p></td>'
        r'<td[^>]*><p[^>]*>(?P<Interstate_acquired>[0-9,]+)</p></td></tr>',

        r'<tr[^>]*><td[^>]*><p[^>]*>Contact of a confirmed ACT case</p></td>'
        r'<td[^>]*><p[^>]*>(?P<Contact_of_a_confirmed_ACT_case>[0-9,]+)</p></td></tr>',

        r'<tr[^>]*><td[^>]*><p[^>]*>Unknown / local transmission</p></td>'
        r'<td[^>]*><p[^>]*>(?P<Unknown_or_local_transmission>[0-9,]+)</p></td></tr>',

        r'<tr[^>]*><td[^>]*><p[^>]*>Under investigation</p></td>'
        r'<td[^>]*><p[^>]*>(?P<Under_investigation>[0-9,]+)</p></td></tr>'
    ):
        re_soi = compile(re_text, IGNORECASE)
        match = re_soi.search(html)
        if match:
            gd = match.groupdict()
            for k, v in gd.items():
                if v is None:
                    continue
                r.append(
                    DataPoint(region_schema=Schemas.ADMIN_1,
                              region_parent='AU',
                              region_child='AU-ACT',
                              datatype=act_norm_map[k.replace('_', ' ')],
                              value=int(v.replace(',', '')),
                              date_updated=du,
                              source_url=url))

    return r or None

def _get_datapoints(date, path):
    r = []

    with open(path, 'r', encoding='utf-8') as f:
        data = json.loads(f.read())

        if isinstance(data, list):
            for row in data:
                for datatype, value in ((DataTypes.STATUS_ACTIVE, row['activedisp']),
                                        (DataTypes.TOTAL, row['cases'])):
                    if value == 'Five or fewer active cases':
                        continue
                    r.append(
                        DataPoint(region_schema=Schemas.POSTCODE,
                                  region_parent='AU-VIC',
                                  region_child=str(row['postcode']),
                                  datatype=datatype,
                                  value=int(value),
                                  date_updated=date,
                                  source_url=SOURCE_URL,
                                  source_id=SOURCE_ID))
        else:
            for row in data['rows']:
                for datatype, value in ((DataTypes.STATUS_ACTIVE, row['active']),
                                        (DataTypes.TOTAL, row['total'])):
                    r.append(
                        DataPoint(region_schema=Schemas.POSTCODE,
                                  region_parent='AU-VIC',
                                  region_child=str(row['postcode']),
                                  datatype=datatype,
                                  value=int(value),
                                  date_updated=date,
                                  source_url=SOURCE_URL,
                                  source_id=SOURCE_ID))

    return r

def add_new_datapoints_from_total(self, source_id,
                                  new_datatype, total_datatype):
    print("Adding new datapoints from totals:", source_id)
    new_datapoints = self.datapoints_db.select_many(
        source_id=['=?', [source_id]],
        datatype=['=?', [new_datatype]])
    total_datapoints = self.datapoints_db.select_many(
        source_id=['=?', [source_id]],
        datatype=['=?', [total_datatype]])

    n = {}
    t = {}
    for new_datapoint in new_datapoints:
        n[new_datapoint.date_updated,
          new_datapoint.region_schema,
          new_datapoint.region_parent,
          new_datapoint.region_child,
          new_datapoint.agerange] = new_datapoint
    for total_datapoint in total_datapoints:
        t[total_datapoint.date_updated,
          total_datapoint.region_schema,
          total_datapoint.region_parent,
          total_datapoint.region_child,
          total_datapoint.agerange] = total_datapoint

    append_datapoints = []

    for k, total_datapoint in t.items():
        if k in n:
            # Already have a new datapoint for this, so don't add!
            continue

        day_before = date_fns.apply_timedelta(
            total_datapoint.date_updated, days=-1)
        k_pd = (day_before,) + k[1:]  # previous day
        if not k_pd in t:
            continue
        total_datapoint_pd = t[k_pd]

        append_datapoints.append(
            DataPoint(region_schema=total_datapoint.region_schema,
                      region_parent=total_datapoint.region_parent,
                      region_child=total_datapoint.region_child,
                      date_updated=total_datapoint.date_updated,
                      datatype=new_datatype,
                      agerange=total_datapoint.agerange,
                      value=total_datapoint.value - total_datapoint_pd.value,
                      source_url='DERIVED'))

    self.datapoints_db.extend(source_id, append_datapoints, is_derived=True)

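# Worked standalone sketch of the derivation above (illustrative names): a NEW
# value for a given key is only synthesised when both that day's TOTAL and the
# previous day's TOTAL exist, as new = total(day) - total(day - 1).
#
#     from datetime import date, timedelta
#
#     def derive_new_from_totals(totals):
#         # totals: {(day, region): cumulative_total}
#         derived = {}
#         for (day, region), total in totals.items():
#             prev = totals.get((day - timedelta(days=1), region))
#             if prev is not None:
#                 derived[day, region] = total - prev
#         return derived
#
#     derive_new_from_totals({(date(2020, 6, 8), 'AU-VIC'): 100,
#                             (date(2020, 6, 9), 'AU-VIC'): 107})
#     # -> {(date(2020, 6, 9), 'AU-VIC'): 7}
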
def _get_total_source_of_infection(self, url, html):
    """
    Source                                                 Cases
    Overseas acquired                                      252
    Locally acquired (close contact of a confirmed case)   78
    Locally acquired (Interstate travel)                   7
    Locally acquired (contact not identified)              3
    Under investigation                                    27
    TOTAL                                                  367
    """
    html = html.replace('\xa0', ' ')  # strip non-breaking spaces
    r = []
    du = None

    # Normalise it with other states
    sa_norm_map = {
        'Overseas acquired': DataTypes.SOURCE_OVERSEAS,
        'Locally acquired (Interstate travel)': DataTypes.SOURCE_INTERSTATE,
        'Locally acquired (close contact of a confirmed case)':
            DataTypes.SOURCE_CONFIRMED,
        'Locally acquired (contact not identified)':
            DataTypes.SOURCE_COMMUNITY,
        'Under investigation': DataTypes.SOURCE_UNDER_INVESTIGATION
    }

    for k in ('Overseas acquired',
              'Locally acquired (close contact of a confirmed case)',
              'Locally acquired (Interstate travel)',
              'Locally acquired (contact not identified)',
              'Under investigation'):
        tr = self._pq_contains(html, 'tr', k, ignore_case=True)
        if not tr:
            continue
        if du is None:
            du = self._get_date(url, html)

        tr = tr[0]
        value = int(pq(tr[1]).text().strip())
        r.append(
            DataPoint(region_schema=Schemas.ADMIN_1,
                      region_parent='AU',
                      region_child='AU-SA',
                      datatype=sa_norm_map[k],
                      value=value,
                      date_updated=du,
                      source_url=url))

    return r or None

def _get_recovered_data(self, updated_date, response_dict):
    r = []
    data = response_dict['recovered'][1]
    recovered = data['result']['data']['dsr']['DS'][0]['PH'][0]['DM0'][0]['M0']

    r.append(DataPoint(
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-ACT',
        datatype=DataTypes.STATUS_RECOVERED,
        value=self._to_int(recovered),
        date_updated=updated_date,
        source_url=self.source_url,
        source_id=self.SOURCE_ID
    ))
    return r