def _get_from_path(self, path_data_page, date):
    r = []

    with open(path_data_page, 'r', encoding='utf-8') as f:
        for item in csv.DictReader(f):
            r.append(
                DataPoint(
                    region_schema=Schemas.POSTCODE,
                    region_parent='AU-VIC',
                    region_child=item['Postcode'],
                    datatype=DataTypes.TOTAL,
                    value=int(item['Confirmed cases (ever)'] or 0),
                    date_updated=date,  # FIXME!!!!!
                    source_url=self.SOURCE_URL,
                    source_id=self.SOURCE_ID))
            r.append(
                DataPoint(
                    region_schema=Schemas.POSTCODE,
                    region_parent='AU-VIC',
                    region_child=item['Postcode'],
                    datatype=DataTypes.STATUS_ACTIVE,
                    value=int(item['Active cases (current)'] or 0),
                    date_updated=date,  # FIXME!!!!!
                    source_url=self.SOURCE_URL,
                    source_id=self.SOURCE_ID))

    return r

def _get_timeline(self):
    # {"UpdateDate":"14\/05\/2020 11:35",
    #  "Source":"https:\/\/covid19.th-stat.com\/",
    #  "DevBy":"https:\/\/www.kidkarnmai.com\/",
    #  "SeverBy":"https:\/\/smilehost.asia\/",
    #  "Data":[{
    #      "Date":"01\/01\/2020",
    #      "NewConfirmed":0,
    #      "NewRecovered":0,
    #      "NewHospitalized":0,
    #      "NewDeaths":0,
    #      "Confirmed":0,
    #      "Recovered":0,
    #      "Hospitalized":0,
    #      "Deaths":0
    #  }, ...
    r = []
    text = self.get_text('timeline.json', include_revision=True)
    data = json.loads(text)

    for item in data['Data']:
        if not item['Date']:
            continue
        date = self.convert_date(item['Date'], formats=('%m/%d/%Y',))

        r.append(
            DataPoint(region_schema=Schemas.ADMIN_0,
                      region_parent='',
                      region_child='TH',
                      datatype=DataTypes.TOTAL,
                      value=int(item['Confirmed']),
                      date_updated=date,
                      source_url=self.SOURCE_URL))
        r.append(
            DataPoint(region_schema=Schemas.ADMIN_0,
                      region_parent='',
                      region_child='TH',
                      datatype=DataTypes.STATUS_RECOVERED,
                      value=int(item['Recovered']),
                      date_updated=date,
                      source_url=self.SOURCE_URL))
        r.append(
            DataPoint(region_schema=Schemas.ADMIN_0,
                      region_parent='',
                      region_child='TH',
                      datatype=DataTypes.STATUS_HOSPITALIZED,
                      value=int(item['Hospitalized']),
                      date_updated=date,
                      source_url=self.SOURCE_URL))
        r.append(
            DataPoint(region_schema=Schemas.ADMIN_0,
                      region_parent='',
                      region_child='TH',
                      datatype=DataTypes.STATUS_DEATHS,
                      value=int(item['Deaths']),
                      date_updated=date,
                      source_url=self.SOURCE_URL))

    return r

def _get_total_cases_by_region(self, url, html):
    if url == self.STATS_BY_REGION_URL:
        tables = self._pq_contains(
            html, 'table', 'LGA Region', ignore_case=True) or []
        du = self._get_date(url, html)
        r = []

        for table in tables:
            for lga, num_cases in table[1]:
                lga = pq(lga).text().strip()

                if lga.lower() == 'total':
                    # This value is very often out of date!!!
                    if False:
                        r.append(
                            DataPoint(region_schema=Schemas.THS,
                                      region_parent='au-tas',
                                      region_child=pq(table[0][0][0]).text()
                                          .strip().split(' - ')[-1].strip(),
                                      datatype=DataTypes.TOTAL,
                                      value=int(
                                          pq(num_cases).text().replace(
                                              ',', '').strip()),
                                      date_updated=du,
                                      source_url=url))
                else:
                    r.append(
                        DataPoint(region_schema=Schemas.LGA,
                                  region_parent='au-tas',
                                  region_child=lga,
                                  datatype=DataTypes.TOTAL,
                                  value=int(
                                      pq(num_cases).text().replace(
                                          ',', '').strip()),
                                  date_updated=du,
                                  source_url=url))
        return r
    else:
        table = self._pq_contains(html, 'table', 'Local Government Area',
                                  ignore_case=True)
        du = self._get_date(url, html)
        r = []

        if table:
            for region_child, lga, num_cases in table[0][1]:
                r.append(
                    DataPoint(region_schema=Schemas.LGA,
                              region_parent='au-tas',
                              region_child=pq(lga).text().strip(),
                              datatype=DataTypes.TOTAL,
                              value=int(
                                  pq(num_cases).text().replace(
                                      ',', '').strip()),
                              date_updated=du,
                              source_url=url))
        return r

def _get_datapoints(self, path):
    date = path.name.split('-')[0]
    print(date)

    for path in path.iterdir():
        print(path)

        with open(path, 'r', encoding='utf-8') as f:
            r = []
            text = json.loads(f.read())
            data = self.get_from_multipart(text, 'dataColumns')

            values_by_idx = self.get_recursively(data, 'dataColumns')[0]['dataValues']
            lga_by_idx = self.get_recursively(data, 'dataColumns')[2]['dataValues']
            active_idx = self.get_recursively(data, 'paneColumnsList')[0]['vizPaneColumns'][3]['aliasIndices']
            total_idx = self.get_recursively(data, 'paneColumnsList')[0]['vizPaneColumns'][4]['aliasIndices']

            for _active, lga in zip(active_idx, lga_by_idx[1:]):
                if _active in (-44, -70, -71):
                    continue
                elif _active < 0:
                    continue
                    # Unreachable: kept disabled so other unexpected negative
                    # indices are skipped rather than aborting the run
                    raise Exception(_active)

                r.append(DataPoint(
                    region_schema=Schemas.LGA,
                    region_parent='AU-SA',
                    region_child=normalize_locality_name(lga),
                    datatype=DataTypes.STATUS_ACTIVE,
                    value=int(values_by_idx[_active]),
                    date_updated=date,
                    source_url=self.SOURCE_URL,
                    source_id=self.SOURCE_ID
                ))

            for _total, lga in zip(total_idx, lga_by_idx[1:]):
                if _total == -44:
                    continue
                elif _total < 0:
                    raise Exception(_total)

                r.append(DataPoint(
                    region_schema=Schemas.LGA,
                    region_parent='AU-SA',
                    region_child=normalize_locality_name(lga),
                    datatype=DataTypes.TOTAL,
                    value=int(values_by_idx[_total]),
                    date_updated=date,
                    source_url=self.SOURCE_URL,
                    source_id=self.SOURCE_ID
                ))

            #print(values_by_idx)
            #print(lga_by_idx)
            #pprint(total_idx)
            return r

def _get_source_of_infection(self, updated_date, response_dict):
    # * Overseas acquired
    # * Cruise ship acquired (included in overseas acquired)
    # * Interstate acquired
    # * Locally acquired - contact of a confirmed case
    # * Locally acquired - contact not identified
    # * Under investigation

    # Normalise it with other states
    vic_norm_map = {
        'Travel overseas': DataTypes.SOURCE_OVERSEAS,
        'Contact with a confirmed case': DataTypes.SOURCE_CONFIRMED,
        'Acquired in Australia, unknown source': DataTypes.SOURCE_COMMUNITY,
        'Under investigation': DataTypes.SOURCE_UNDER_INVESTIGATION
    }
    output = []
    data = response_dict['source_of_infection'][1]
    added = set()

    for source in data['result']['data']['dsr']['DS'][0]['PH'][0]['DM0']:
        output.append(
            DataPoint(region_schema=Schemas.ADMIN_1,
                      region_parent='AU',
                      region_child='AU-VIC',
                      datatype=vic_norm_map[source['C'][0]],
                      value=source['C'][1],
                      date_updated=updated_date,
                      source_url=self.SOURCE_URL,
                      source_id=self.SOURCE_ID))
        added.add(vic_norm_map[source['C'][0]])

    for datatype in vic_norm_map.values():
        if datatype in added:
            continue
        # Sometimes "under investigation" isn't provided,
        # but probably can assume at 0 for these days
        output.append(
            DataPoint(region_schema=Schemas.ADMIN_1,
                      region_parent='AU',
                      region_child='AU-VIC',
                      datatype=datatype,
                      value=0,
                      date_updated=updated_date,
                      source_url=self.SOURCE_URL,
                      source_id=self.SOURCE_ID))

    return output

def get_statistic(self):
    r = []
    data = json.loads(self.get_text('statistic.json', include_revision=True))
    by_region = Counter()

    for region_data in data:
        # {"dataSet":{"id":1,"code":"COVID19","name":"COVID19 statistics",
        #  "shortName":"COVID19 stat","sourceUrl":"file:///tmp/covid19stat.txt",
        #  "resourceUrl":null,"dataSetGroup":{"id":1,"name":"Default","pos":1,
        #  "icon_resource":null},"pos":1,"isComparable":null,"delimiter":";"},
        #print(region_data)

        for point_dict in region_data['points']:
            # {"abscissa":{"id":45862,"year":2020,"month":3,"day":10,
            #  "name":"2020-03-10","date":"2020-03-10"},"ordinate":0.0}
            #print(point_dict['abscissa']['date'])
            if point_dict['ordinate'] is None:
                continue

            region_child = region_map[region_data['name'].lower().strip()]
            value = int(point_dict['ordinate'])
            date = self.convert_date(point_dict['abscissa']['date'])
            by_region[date, region_child] += value

            r.append(DataPoint(
                region_schema=Schemas.ADMIN_1,
                region_parent='RS',
                region_child=region_child,
                datatype=DataTypes.NEW,
                value=value,
                source_url=self.SOURCE_URL,
                date_updated=date
            ))

    cumulative = Counter()
    for (date, region_child), value in sorted(by_region.items()):
        cumulative[region_child] += value
        r.append(DataPoint(
            region_schema=Schemas.ADMIN_1,
            region_parent='RS',
            region_child=region_child,
            datatype=DataTypes.TOTAL,
            value=cumulative[region_child],
            source_url=self.SOURCE_URL,
            date_updated=date
        ))

    return r

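# A minimal standalone sketch (illustrative only, not part of the scraper) of
# the cumulative-total logic in get_statistic above: per-day NEW values are
# bucketed by (date, region), then summed in date order so each region carries
# a running TOTAL. Names here are hypothetical.
#
#     from collections import Counter
#
#     def cumulative_totals(new_values):
#         # new_values: iterable of (iso_date, region, value), in any order
#         by_region = Counter()
#         for date, region, value in new_values:
#             by_region[date, region] += value
#
#         cumulative = Counter()
#         totals = []
#         # ISO date strings sort chronologically, so plain sorted() suffices
#         for (date, region), value in sorted(by_region.items()):
#             cumulative[region] += value
#             totals.append((date, region, cumulative[region]))
#         return totals
#
#     cumulative_totals([('2020-03-10', 'RS-00', 2), ('2020-03-11', 'RS-00', 3)])
#     # -> [('2020-03-10', 'RS-00', 2), ('2020-03-11', 'RS-00', 5)]
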
def _get_total_male_female_breakdown(self, url, html):
    du = self._get_date(url, html)
    regex = compile(
        r'Total cases include ([0-9,]+) men and ([0-9,]+) women')
    match = regex.search(html)

    if match:
        men = int(match.group(1).replace(',', ''))
        women = int(match.group(2).replace(',', ''))

        men = DataPoint(region_schema=Schemas.ADMIN_1,
                        region_parent='au',
                        region_child='au-vic',
                        date_updated=du,
                        datatype=DataTypes.TOTAL_MALE,
                        value=men,
                        source_url=url)
        women = DataPoint(region_schema=Schemas.ADMIN_1,
                          region_parent='au',
                          region_child='au-vic',
                          date_updated=du,
                          datatype=DataTypes.TOTAL_FEMALE,
                          value=women,
                          source_url=url)
        return men, women
    else:
        men = self._extract_number_using_regex(
            compile('total[^0-9.]+?([0-9,]+) men'),
            html,
            region_schema=Schemas.ADMIN_1,
            region_parent='AU',
            region_child='AU-VIC',
            source_url=url,
            datatype=DataTypes.TOTAL_MALE,
            date_updated=du)
        women = self._extract_number_using_regex(
            compile('total[^0-9.]+?([0-9,]+) women'),
            html,
            region_schema=Schemas.ADMIN_1,
            region_parent='AU',
            region_child='AU-VIC',
            source_url=url,
            datatype=DataTypes.TOTAL_FEMALE,
            date_updated=du)

        if men is not None and women is not None:
            return men, women
        return None

def _get_total_cases_tested(self, url, html):
    neg_cases = self._extract_number_using_regex(
        # Seems the WA website's wording can change day-to-day
        compile(r'([0-9]+[0-9,]*?)'
                r'([^0-9]*?negative COVID-19 tests|'
                r'[^0-9]*?tested negative|'
                r'[^0-9]*?negative)'),
        html,
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-WA',
        source_url=url,
        datatype=DataTypes.TESTS_TOTAL,
        date_updated=self._get_date(url, html))
    pos_cases = self._get_total_cases(url, html)

    if neg_cases and pos_cases:
        return DataPoint(region_schema=Schemas.ADMIN_1,
                         region_parent='AU',
                         region_child='AU-WA',
                         datatype=neg_cases.datatype,
                         value=neg_cases.value + pos_cases.value,
                         date_updated=neg_cases.date_updated,
                         source_url=neg_cases.source_url,
                         text_match=(neg_cases.text_match,
                                     pos_cases.text_match))
    return None

def _get_total_new_cases(self, href, html):
    c_html = word_to_number(html)

    if 'same total number as yesterday' in html:
        # https://www.dhhs.vic.gov.au/coronavirus-update-victoria-27-april-2020
        return DataPoint(region_schema=Schemas.ADMIN_1,
                         region_parent='AU',
                         region_child='AU-VIC',
                         datatype=DataTypes.NEW,
                         value=0,
                         date_updated=self._get_date(href, html),
                         source_url=href,
                         text_match='same total number as yesterday')

    return self._extract_number_using_regex(
        compile('increase of ([0-9,]+)'),
        c_html,
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-VIC',
        datatype=DataTypes.NEW,
        source_url=href,
        date_updated=self._get_date(href, html)
    ) or self._extract_number_using_regex(
        compile('([0-9,]+) new cases'),
        c_html,
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-VIC',
        datatype=DataTypes.NEW,
        source_url=href,
        date_updated=self._get_date(href, html))

def _get_recovered_sum(self):
    r = []
    base_dir = self.get_path_in_dir('')

    for date in self.iter_nonempty_dirs(base_dir):
        path = f'{base_dir}/{date}/is_index.html'
        with open(path, 'rb') as f:
            data = f.read()
            data = data.decode('utf-8')

        # TODO: There are quite a few more stats!!
        regional_stats = data.split(
            '[[[null,{"font-weight":"700","value":"Infections"},'
            '{"font-weight":"700","value":"Quarantine"}],')[1].split(']]],')[0]
        #print(regional_stats)
        regional_stats = json.loads(f'[{regional_stats}]]')

        for region, infections_dict, quarantine_dict in regional_stats:
            region = place_map[region]
            r.append(DataPoint(
                region_schema=Schemas.ADMIN_1,
                region_parent='IS',
                region_child=region,
                datatype=DataTypes.TOTAL,
                # This changed to be an int from a dict on 9 Jun
                value=int(infections_dict['value'])
                    if isinstance(infections_dict, dict)
                    else int(infections_dict),
                date_updated=date,
                source_url=self.SOURCE_URL
            ))

    return r

def _get_total_cases(self, href, html):
    du = self._get_date(href, html)

    if href in (self.STATS_BY_REGION_URL, self.STATS_BY_REGION_URL_2):
        # New format as of 22 April
        cases = pq(html)('.qh-fact-wrapper .cases span')
        if cases:
            return self._extract_number_using_regex(
                compile('([0-9,]+)'),
                pq(cases[0]).text().strip(),
                region_schema=Schemas.ADMIN_1,
                region_parent='AU',
                region_child='AU-QLD',
                datatype=DataTypes.TOTAL,
                date_updated=du,
                source_url=href)
        else:
            return None

    # Use new format from the table if possible
    totals_dict = self.__get_totals_from_table(html)
    if totals_dict:
        return DataPoint(region_schema=Schemas.ADMIN_1,
                         region_parent='AU',
                         region_child='AU-QLD',
                         datatype=DataTypes.TOTAL,
                         value=totals_dict['total'],
                         date_updated=du,
                         source_url=href,
                         text_match=None)

    c_html = word_to_number(html)
    return self._extract_number_using_regex(
        (compile('state total to ([0-9,]+)'),
         compile('total of ([0-9,]+) (?:people|person)')),
        c_html,
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-QLD',
        source_url=href,
        datatype=DataTypes.TOTAL,
        date_updated=du
    ) or self._extract_number_using_regex(
        compile(
            # Total number changed from being enclosed in a <strong>
            # tag to a <b> tag, so changed to be as broad as NSW
            # <strong>Total</strong></td>
            # <td headers="table59454r1c2"><b>37,334</b></td>
            r'<td[^>]*?>(?:<[^</>]+>)?Total(?:</[^<>]+>)?</td>'
            r'[^<]*?<td[^>]*?>.*?([0-9,]+).*?</td>',
            MULTILINE | DOTALL),
        c_html,
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-QLD',
        source_url=href,
        datatype=DataTypes.TOTAL,
        date_updated=du)

def _get_all_lga_datapoints(self, date):
    r = []
    current_date = None
    by_lga = ExpiringCounter()

    with open(get_data_dir() / 'vic' / 'csv_data' / date / 'all_lga.csv',
              'r', encoding='utf-8') as f:
        # The appended sentinel row forces a final flush of the last date bucket
        for row in sorted(csv.DictReader(f), key=lambda x: x['diagnosis_date']) + \
                [{'diagnosis_date': '1111-01-01', 'Localgovernmentarea': None}]:
            date_updated = self.convert_date(row['diagnosis_date'])

            if current_date != date_updated:
                if current_date is not None:
                    for lga, value in by_lga.items():
                        r.append(
                            DataPoint(region_schema=Schemas.LGA,
                                      region_parent='AU-VIC',
                                      region_child=normalize_locality_name(
                                          lga.split('(')[0].strip()),
                                      datatype=DataTypes.TOTAL,
                                      value=int(value),
                                      date_updated=current_date,
                                      source_url=self.SOURCE_URL,
                                      source_id=self.SOURCE_ID))
                current_date = date_updated

            if row['Localgovernmentarea']:
                by_lga[row['Localgovernmentarea'].strip('_')] += 1

    return r

def _get_agegroup_datapoints(self, date):
    r = []
    current_date = None
    by_agegroup = Counter()

    with open(get_data_dir() / 'vic' / 'csv_data' / date / 'agegroup.csv',
              'r', encoding='utf-8') as f:
        # The appended sentinel row forces a final flush of the last date bucket
        for row in sorted(csv.DictReader(f), key=lambda x: x['diagnosis_date']) + \
                [{'diagnosis_date': '1111-01-01', 'agegroup': None}]:
            assert len(row['diagnosis_date']) in (9, 10), row['diagnosis_date']
            date_updated = self.convert_date(row['diagnosis_date'])

            if current_date != date_updated:
                if current_date is not None:
                    for agerange, value in by_agegroup.items():
                        r.append(
                            DataPoint(region_schema=Schemas.ADMIN_1,
                                      region_parent='AU',
                                      region_child='AU-VIC',
                                      datatype=DataTypes.TOTAL,
                                      agerange=agerange,
                                      value=int(value),
                                      date_updated=current_date,
                                      source_url=self.SOURCE_URL,
                                      source_id=self.SOURCE_ID))
                current_date = date_updated

            if row['agegroup']:
                by_agegroup[row['agegroup'].strip('_')] += 1

    return r

def __get_tests_datapoints(self, SOURCE_URL, path_tests):
    r = DataPointMerger()

    with open(path_tests, 'r', encoding='utf-8') as f:
        for item in json.loads(f.read())['data']:
            try:
                item['POA_NAME16'] = str(int(float(item['POA_NAME16'])))
            except ValueError:
                pass

            date = self.__get_partial_date(path_tests, item['Date'])
            number = int(item['Number'])
            # recent = item['Recent']  # TODO: ADD ME!!!
            postcode = item['POA_NAME16'] if item['POA_NAME16'] else 'Unknown'

            r.append(DataPoint(
                region_schema=Schemas.POSTCODE,
                region_parent='AU-NSW',
                region_child=postcode,
                datatype=DataTypes.TESTS_TOTAL,
                value=number,
                date_updated=date,
                source_url=SOURCE_URL,
                source_id=self.SOURCE_ID
            ))

    return r

def _get_total_source_of_infection(self, url, html):
    norm_map = {
        'Locally Acquired—close contact with confirmed case':
            DataTypes.SOURCE_CONFIRMED,
        'Locally Acquired—no known contact': DataTypes.SOURCE_COMMUNITY,
        'Interstate acquired': DataTypes.SOURCE_INTERSTATE,
        'Overseas acquired': DataTypes.SOURCE_OVERSEAS,
        'Under investigation': DataTypes.SOURCE_UNDER_INVESTIGATION,
    }
    du = self._get_date(url, html)

    if url == self.STATS_BY_REGION_URL_2:
        table = pq(html)('#QLD_Cases_Sources_Of_Infection')[0]
        #print(pq(table).html())
        r = []

        for header, value in table[0]:
            header = pq(header).text().strip()
            if header in ('Confirmed cases', 'Total cases'):
                continue
            value = pq(value).text().strip()

            r.append(
                DataPoint(region_schema=Schemas.ADMIN_1,
                          region_parent='AU',
                          region_child='AU-QLD',
                          datatype=norm_map[header],
                          value=int(value.replace(',', '')),
                          date_updated=du,
                          source_url=url))
        return r
    else:
        return []

def _get_lga_datapoints(self, date):
    # LGA             lga_pid  population  active  cases  rate  new  band  LGADisplay  data_date
    # Alpine (S)      VIC242   12814       0       1      0     0    0     Alpine      29/08/2020
    # Ararat (RC)     VIC220   11845       1       7      8.4   0    1     Ararat      29/08/2020
    # Ballarat (C)    VIC241   109505      6       61     5.5   0    1     Ballarat    29/08/2020
    # Banyule (C)     VIC188   131631      30      437    22.8  0    2     Banyule     29/08/2020
    # Bass Coast (S)  VIC173   36320       0       11     0     0    0     Bass Coast  29/08/2020
    # Baw Baw (S)     VIC194   53396       1       15     1.9   0    1     Baw Baw     29/08/2020
    # Bayside (C)     VIC182   106862      72      227    67.4  6    3     Bayside     29/08/2020
    # Benalla (RC)    VIC199   14037       0       3      0     0    0     Benalla     29/08/2020
    r = []
    print("LGA:", get_data_dir() / 'vic' / 'csv_data' / date)

    with open(get_data_dir() / 'vic' / 'csv_data' / date / 'lga.json',
              'r', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            #print(row)
            date_updated = self.convert_date(row['data_date'])

            for datatype, value in ((DataTypes.STATUS_ACTIVE, row['active']),
                                    (DataTypes.TOTAL, row['cases'])):
                r.append(
                    DataPoint(region_schema=Schemas.LGA,
                              region_parent='AU-VIC',
                              region_child=normalize_locality_name(
                                  row['LGA'].split('(')[0].strip()),
                              datatype=datatype,
                              value=int(value),
                              date_updated=date_updated,
                              source_url=self.SOURCE_URL,
                              source_id=self.SOURCE_ID))

    return r

def __get_merged_datapoint(self, datapoint):
    """
    Find+remove the previous datapoint (if it exists);
    return a new datapoint with both values added
    """
    unique_key = self.__get_unique_key(datapoint)

    if unique_key in self.__datapoint_indexes:
        replace_index = self.__datapoint_indexes[unique_key]
        i = self[replace_index]
        r = DataPoint(region_schema=datapoint.region_schema,
                      region_parent=datapoint.region_parent,
                      region_child=datapoint.region_child,
                      date_updated=datapoint.date_updated,
                      datatype=datapoint.datatype,
                      agerange=datapoint.agerange,
                      value=datapoint.value + i.value,
                      source_url=datapoint.source_url or i.source_url,
                      text_match=datapoint.text_match or i.text_match,
                      source_id=datapoint.source_id)
    else:
        replace_index = None
        r = datapoint

    return replace_index, r

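# Usage sketch for the merge step above (hypothetical driver code, inferred
# from the docstring rather than shown in the source): the caller looks up
# replace_index, drops the old datapoint at that index, and appends the
# returned summed datapoint, so two datapoints sharing a unique key
# (region/date/datatype/agerange) collapse into one whose value is the sum:
#
#     replace_index, merged = self.__get_merged_datapoint(datapoint)
#     if replace_index is not None:
#         del self[replace_index]
#     self.append(merged)
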
def _get_regions(self, updated_date, response_dict):
    output = []
    data = response_dict['regions'][1]
    previous_value = None

    for region_child in data['result']['data']['dsr']['DS'][0]['PH'][0]['DM0']:
        value, previous_value = self.process_powerbi_value(
            region_child, previous_value, data)
        if value[0] is None:
            continue
        region_string = value[0].split('(')[0].strip()

        output.append(
            DataPoint(region_schema=Schemas.LGA,
                      region_parent='au-vic',
                      region_child=region_string,
                      datatype=DataTypes.TOTAL,
                      value=value[1],
                      date_updated=updated_date,
                      source_url=self.SOURCE_URL,
                      source_id=self.SOURCE_ID))
        previous_value = value
        # print(output[-1])
        self.totals_dict[region_string] = value[1]

    return output

def _get_total_age_breakdown(self, href, html):
    if href == self.STATS_BY_REGION_URL_2:
        r = []
        table = pq(html)('#QLD_CasesByAgeAndGender')[0][1]
        du = self._get_date(href, html)

        for tr in table[1:]:
            age_group = pq(tr[0]).text().strip()
            female = int(pq(tr[1]).text().replace(',', ''))
            male = int(pq(tr[2]).text().replace(',', ''))
            total = int(pq(tr[3]).text().replace(',', ''))

            for datatype, value in ((DataTypes.TOTAL_FEMALE, female),
                                    (DataTypes.TOTAL_MALE, male),
                                    (DataTypes.TOTAL, total)):
                if value is None:
                    continue
                r.append(
                    DataPoint(region_schema=Schemas.ADMIN_1,
                              region_parent='AU',
                              region_child='AU-QLD',
                              datatype=datatype,
                              agerange=age_group,
                              value=value,
                              date_updated=du,
                              source_url=href))
        return r

def _get_total_age_breakdown(self, href, html):
    table = self._pq_contains(html, 'table', 'By age group',
                              ignore_case=True)
    if not table:
        return  # WARNING!!!

    du = self._get_date(href, html)
    table = table[0]
    tbody = pq(table)('tbody')[0]
    tr = tbody[1]

    ages = [
        int(i.replace(',', '').strip())
        for i in pq(tr).text().split('\n')
    ]
    ages = {
        '0-29': ages[0],
        '30-39': ages[1],
        '40-49': ages[2],
        '50-59': ages[3],
        '60-69': ages[4],
        '70+': ages[5]
    }

    r = []
    for k, v in ages.items():
        r.append(
            DataPoint(region_schema=Schemas.ADMIN_1,
                      region_parent='AU',
                      region_child='AU-ACT',
                      datatype=DataTypes.TOTAL,
                      agerange=k,
                      value=v,
                      date_updated=du,
                      source_url=href))
    return r

def _get_postcode_datapoints(self, date):
    # postcode  population  active  cases  rate   new  band  data_date
    # 3000      37979       18      119    47.4   0    2     29/08/2020
    # 3001      0           0       1      0      0    0     29/08/2020
    # 3002      4957        2       14     40.3   0    2     29/08/2020
    # 3003      5516        3       36     54.4   0    3     29/08/2020
    # 3004      9311        6       63     64.4   2    3     29/08/2020
    # 3005      523         0       0      0      0    0     29/08/2020
    # 3006      18811       1       64     5.3    0    1     29/08/2020
    # 3008      10438       2       49     19.2   0    1     29/08/2020
    # 3010      1595        0       0      0      0    0     29/08/2020
    # 3011      21464       36      164    167.7  2    4     29/08/2020
    r = []
    print("PostCode:", get_data_dir() / 'vic' / 'csv_data' / date)

    with open(get_data_dir() / 'vic' / 'csv_data' / date / 'postcode.json',
              'r', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            date_updated = self.convert_date(row['data_date'])

            for datatype, value in ((DataTypes.STATUS_ACTIVE, row['active']),
                                    (DataTypes.TOTAL, row['cases'])):
                r.append(
                    DataPoint(region_schema=Schemas.POSTCODE,
                              region_parent='AU-VIC',
                              region_child=row['postcode'],
                              datatype=datatype,
                              value=int(value),
                              date_updated=date_updated,
                              source_url=self.SOURCE_URL,
                              source_id=self.SOURCE_ID))

    return r

def _get_gender_balance_data(self, updated_date, response_dict):
    r = []
    try:
        data = response_dict['gender_balance'][1]
    except KeyError:
        return []  # WARNING!!!

    # WARNING: This sometimes has another query before it!!!
    # NOTE: both branches below are currently identical; the fallback
    # presumably should skip the extra leading query when it appears
    try:
        m_f = data['result']['data']['dsr']['DS'][0]['PH'][0]['DM0']
        assert m_f[0]['C'][0] in ('Males', 'Male')
        assert m_f[1]['C'][0] in ('Females', 'Female')
    except:
        m_f = data['result']['data']['dsr']['DS'][0]['PH'][0]['DM0']
        assert m_f[0]['C'][0] in ('Males', 'Male')
        assert m_f[1]['C'][0] in ('Females', 'Female')

    male = m_f[0]['C'][1]
    try:
        female = m_f[1]['C'][1]
    except IndexError:
        assert m_f[1]['R']
        female = male

    r.append(DataPoint(
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-ACT',
        datatype=DataTypes.TOTAL_MALE,
        value=self._to_int(male),
        date_updated=updated_date,
        source_url=self.source_url,
        source_id=self.SOURCE_ID
    ))
    r.append(DataPoint(
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-ACT',
        datatype=DataTypes.TOTAL_FEMALE,
        value=self._to_int(female),
        date_updated=updated_date,
        source_url=self.source_url,
        source_id=self.SOURCE_ID
    ))
    return r

def __postcode_datapoints_to_lga(self, SOURCE_URL, postcode_to_lga, r,
                                 source_id):
    # Convert postcode to LGA where possible
    new_r = DataPointMerger()
    added_to_lga = set()
    processed_postcode = set()
    mapping = Counter()

    for datapoint in sorted(r, key=lambda i: i.date_updated):
        if datapoint.region_schema == Schemas.LGA:
            added_to_lga.add((
                datapoint.region_child,
                datapoint.datatype
            ))
            continue
        elif datapoint.region_schema != Schemas.POSTCODE:
            continue
        elif datapoint.region_child in postcode_to_lga:
            lga = postcode_to_lga[datapoint.region_child]
        else:
            lga = 'unknown'
            if datapoint.region_child != 'unknown':
                print("NOT FOUND:", datapoint.region_child)
            # continue  # WARNING!!!

        if (datapoint.region_child,
                datapoint.datatype,
                datapoint.date_updated) in processed_postcode:
            #print("IGNORING DOUBLE-UP:", datapoint)
            continue
        processed_postcode.add((datapoint.region_child,
                                datapoint.datatype,
                                datapoint.date_updated))

        #if lga == 'cumberland':
        #    print('USING:', datapoint)

        mapping[
            lga,
            datapoint.datatype,
            datapoint.date_updated
        ] += datapoint.value

    new_r.extend(r)

    for (lga, datatype, date_updated), value in mapping.items():
        if (lga, datatype) in added_to_lga:
            # Don't add to LGA if available using direct data!
            continue

        new_r.append(DataPoint(
            region_schema=Schemas.LGA,
            region_parent='AU-NSW',
            region_child=lga,
            datatype=datatype,
            value=value,
            date_updated=date_updated,
            source_url=SOURCE_URL,
            source_id=source_id
        ))

    return new_r

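# A condensed standalone sketch (illustrative, with hypothetical names) of the
# postcode-to-LGA roll-up above: postcode values are summed into their mapped
# LGA per (datatype, date), and any (lga, datatype) pair already covered by
# direct LGA data is dropped so aggregated values never shadow first-party ones.
#
#     from collections import Counter
#
#     def roll_up_postcodes(postcode_values, postcode_to_lga, direct_lga_keys):
#         # postcode_values: iterable of (postcode, datatype, date, value)
#         # direct_lga_keys: set of (lga, datatype) pairs with first-party data
#         mapping = Counter()
#         for postcode, datatype, date, value in postcode_values:
#             lga = postcode_to_lga.get(postcode, 'unknown')
#             mapping[lga, datatype, date] += value
#         return {k: v for k, v in mapping.items()
#                 if (k[0], k[1]) not in direct_lga_keys}
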
def _get_total_age_breakdown(self, href, html):
    # TODO: TRANSITION TO https://data.nsw.gov.au/nsw-covid-19-data !!
    if '20200316_02.aspx' in href:
        # HACK: The very first entry was in a different format with
        # percentages. Maybe I could fix this later, but not sure it's worth it
        return None

    r = []
    table = self._pq_contains(
        html, 'table', 'Age Group',
        ignore_case=True
    )
    if not table:
        return None
    table = table[0]
    du = self._get_date(href, html)

    for age_group in (
        '0-9', '10-19', '20-29', '30-39', '40-49',
        '50-59', '60-69', '70-79', '80-89', '90-100'
    ):
        tds = self._pq_contains(table, 'tr', age_group)
        if not tds:
            continue
        tds = tds[0]

        female = int(pq(tds[1]).text().strip() or 0)
        male = int(pq(tds[2]).text().strip() or 0)
        total = int(pq(tds[3]).text().replace(' ', '').strip() or 0)

        for datatype, value in (
            (DataTypes.TOTAL_FEMALE, female),
            (DataTypes.TOTAL_MALE, male),
            (DataTypes.TOTAL, total)
        ):
            if value is None:
                continue
            r.append(DataPoint(
                region_schema=Schemas.ADMIN_1,
                region_parent='AU',
                region_child='AU-NSW',
                datatype=datatype,
                agerange=age_group,
                value=value,
                date_updated=du,
                source_url=href
            ))

    return r

def _extract_number_using_regex(self, regex, s,
                                source_url, datatype, date_updated,
                                agerange=None,
                                region_parent=None,
                                region_child=None,
                                region_schema=Schemas.ADMIN_1):
    """
    Convenience function for removing numeral grouping X,XXX
    and returning a number based on a match from re.compile()
    instance `regex`

    Multiple regexes can be specified for `regex`, in which
    case the first match will be returned
    """
    # assert region_parent
    assert region_child

    if isinstance(regex, (list, tuple)):
        for i_regex in regex:
            dp = self._extract_number_using_regex(
                i_regex, s, source_url, datatype, date_updated,
                agerange, region_parent, region_child, region_schema
            )
            if dp:
                return dp
        return None

    match = regex.search(s)
    # print(regex, match)

    if match:
        num = match.group(1)
        num = num.replace(',', '')

        if num.isdecimal():
            #print(f"    Found Match: {match.group()}")
            num = int(num)

            if date_updated is None:
                date_updated = self._todays_date()

            return DataPoint(
                region_schema=region_schema,
                region_parent=region_parent,
                region_child=region_child,
                datatype=datatype,
                agerange=agerange,
                value=num,
                date_updated=date_updated,
                source_url=source_url,
                text_match=s[
                    max(0, match.start(1) - 40):
                    min(len(s), match.end(1) + 40)
                ]
            )
    return None

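# Example of the multi-regex fallback described in the docstring above (the
# patterns here are placeholders, not ones tied to any particular scraper).
# The first regex that matches wins, so more specific wordings should come
# first; passing date_updated=None falls back to self._todays_date():
#
#     dp = self._extract_number_using_regex(
#         (compile(r'state total to ([0-9,]+)'),
#          compile(r'total of ([0-9,]+) cases')),
#         html,
#         source_url=url,
#         datatype=DataTypes.TOTAL,
#         date_updated=None,
#         region_parent='AU',
#         region_child='AU-QLD',
#     )
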
def _get_total_source_of_infection(self, url, html):
    # NOTE: there are also stats at
    # https://www.covid19.act.gov.au/updates/confirmed-case-information
    # but they're in a different format -
    # not sure it's worth supporting them

    # Normalise it with other states
    act_norm_map = {
        'Overseas acquired': DataTypes.SOURCE_OVERSEAS,
        'Cruise ship acquired': DataTypes.SOURCE_CRUISE_SHIP,
        'Interstate acquired': DataTypes.SOURCE_INTERSTATE,
        'Contact of a confirmed ACT case': DataTypes.SOURCE_CONFIRMED,
        'Unknown or local transmission': DataTypes.SOURCE_COMMUNITY,
        'Under investigation': DataTypes.SOURCE_UNDER_INVESTIGATION,
    }
    du = self._get_date(url, html)
    r = []

    for re_text in (
        r'<tr[^>]*><td[^>]*><p[^>]*>Overseas acquired</p></td>'
        r'<td[^>]*><p[^>]*>(?P<Overseas_acquired>[0-9,]+)</p></td></tr>',

        # Cruise ship-acquired was only added around 6 April
        r'<tr[^>]*><td[^>]*><p[^>]*>Cruise ship acquired</p></td>'
        r'<td[^>]*><p[^>]*>(?P<Cruise_ship_acquired>[0-9,]+) of the [0-9,]+</p></td></tr>',

        r'<tr[^>]*><td[^>]*><p[^>]*>Interstate acquired</p></td>'
        r'<td[^>]*><p[^>]*>(?P<Interstate_acquired>[0-9,]+)</p></td></tr>',

        r'<tr[^>]*><td[^>]*><p[^>]*>Contact of a confirmed ACT case</p></td>'
        r'<td[^>]*><p[^>]*>(?P<Contact_of_a_confirmed_ACT_case>[0-9,]+)</p></td></tr>',

        r'<tr[^>]*><td[^>]*><p[^>]*>Unknown / local transmission</p></td>'
        r'<td[^>]*><p[^>]*>(?P<Unknown_or_local_transmission>[0-9,]+)</p></td></tr>',

        r'<tr[^>]*><td[^>]*><p[^>]*>Under investigation</p></td>'
        r'<td[^>]*><p[^>]*>(?P<Under_investigation>[0-9,]+)</p></td></tr>'
    ):
        re_soi = compile(re_text, IGNORECASE)
        match = re_soi.search(html)
        if match:
            gd = match.groupdict()
            for k, v in gd.items():
                if v is None:
                    continue
                r.append(
                    DataPoint(region_schema=Schemas.ADMIN_1,
                              region_parent='AU',
                              region_child='AU-ACT',
                              datatype=act_norm_map[k.replace('_', ' ')],
                              value=int(v.replace(',', '')),
                              date_updated=du,
                              source_url=url))

    return r or None

def _get_datapoints(date, path):
    r = []

    with open(path, 'r', encoding='utf-8') as f:
        data = json.loads(f.read())

        if isinstance(data, list):
            for row in data:
                for datatype, value in ((DataTypes.STATUS_ACTIVE, row['activedisp']),
                                        (DataTypes.TOTAL, row['cases'])):
                    if value == 'Five or fewer active cases':
                        continue
                    r.append(
                        DataPoint(region_schema=Schemas.POSTCODE,
                                  region_parent='AU-VIC',
                                  region_child=str(row['postcode']),
                                  datatype=datatype,
                                  value=int(value),
                                  date_updated=date,
                                  source_url=SOURCE_URL,
                                  source_id=SOURCE_ID))
        else:
            for row in data['rows']:
                for datatype, value in ((DataTypes.STATUS_ACTIVE, row['active']),
                                        (DataTypes.TOTAL, row['total'])):
                    r.append(
                        DataPoint(region_schema=Schemas.POSTCODE,
                                  region_parent='AU-VIC',
                                  region_child=str(row['postcode']),
                                  datatype=datatype,
                                  value=int(value),
                                  date_updated=date,
                                  source_url=SOURCE_URL,
                                  source_id=SOURCE_ID))

    return r

def add_new_datapoints_from_total(self, source_id,
                                  new_datatype, total_datatype):
    print("Adding new datapoints from totals:", source_id)
    new_datapoints = self.datapoints_db.select_many(
        source_id=['=?', [source_id]],
        datatype=['=?', [new_datatype]])
    total_datapoints = self.datapoints_db.select_many(
        source_id=['=?', [source_id]],
        datatype=['=?', [total_datatype]])

    n = {}
    t = {}
    for new_datapoint in new_datapoints:
        n[new_datapoint.date_updated,
          new_datapoint.region_schema,
          new_datapoint.region_parent,
          new_datapoint.region_child,
          new_datapoint.agerange] = new_datapoint
    for total_datapoint in total_datapoints:
        t[total_datapoint.date_updated,
          total_datapoint.region_schema,
          total_datapoint.region_parent,
          total_datapoint.region_child,
          total_datapoint.agerange] = total_datapoint

    append_datapoints = []

    for k, total_datapoint in t.items():
        if k in n:
            # Already have a new datapoint for this, so don't add!
            continue

        day_before = date_fns.apply_timedelta(
            total_datapoint.date_updated, days=-1)
        k_pd = (day_before,) + k[1:]  # previous day
        if not k_pd in t:
            continue
        total_datapoint_pd = t[k_pd]

        append_datapoints.append(
            DataPoint(region_schema=total_datapoint.region_schema,
                      region_parent=total_datapoint.region_parent,
                      region_child=total_datapoint.region_child,
                      date_updated=total_datapoint.date_updated,
                      datatype=new_datatype,
                      agerange=total_datapoint.agerange,
                      value=total_datapoint.value - total_datapoint_pd.value,
                      source_url='DERIVED'))

    self.datapoints_db.extend(source_id, append_datapoints, is_derived=True)

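# Worked standalone sketch of the derivation above (illustrative names): a NEW
# value for a given key is only synthesised when both that day's TOTAL and the
# previous day's TOTAL exist, as new = total(day) - total(day - 1).
#
#     from datetime import date, timedelta
#
#     def derive_new_from_totals(totals):
#         # totals: {(day, region): cumulative_total}
#         derived = {}
#         for (day, region), total in totals.items():
#             prev = totals.get((day - timedelta(days=1), region))
#             if prev is not None:
#                 derived[day, region] = total - prev
#         return derived
#
#     derive_new_from_totals({(date(2020, 6, 8), 'AU-VIC'): 100,
#                             (date(2020, 6, 9), 'AU-VIC'): 107})
#     # -> {(date(2020, 6, 9), 'AU-VIC'): 7}
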
def _get_total_source_of_infection(self, url, html):
    """
    Source                                                 Cases
    Overseas acquired                                      252
    Locally acquired (close contact of a confirmed case)   78
    Locally acquired (Interstate travel)                   7
    Locally acquired (contact not identified)              3
    Under investigation                                    27
    TOTAL                                                  367
    """
    html = html.replace('\xa0', ' ')  # strip non-breaking spaces
    r = []
    du = None

    # Normalise it with other states
    sa_norm_map = {
        'Overseas acquired': DataTypes.SOURCE_OVERSEAS,
        'Locally acquired (Interstate travel)': DataTypes.SOURCE_INTERSTATE,
        'Locally acquired (close contact of a confirmed case)':
            DataTypes.SOURCE_CONFIRMED,
        'Locally acquired (contact not identified)':
            DataTypes.SOURCE_COMMUNITY,
        'Under investigation': DataTypes.SOURCE_UNDER_INVESTIGATION
    }

    for k in ('Overseas acquired',
              'Locally acquired (close contact of a confirmed case)',
              'Locally acquired (Interstate travel)',
              'Locally acquired (contact not identified)',
              'Under investigation'):
        tr = self._pq_contains(html, 'tr', k, ignore_case=True)
        if not tr:
            continue
        if du is None:
            du = self._get_date(url, html)

        tr = tr[0]
        value = int(pq(tr[1]).text().strip())
        r.append(
            DataPoint(region_schema=Schemas.ADMIN_1,
                      region_parent='AU',
                      region_child='AU-SA',
                      datatype=sa_norm_map[k],
                      value=value,
                      date_updated=du,
                      source_url=url))

    return r or None

def _get_recovered_data(self, updated_date, response_dict):
    r = []
    data = response_dict['recovered'][1]
    recovered = data['result']['data']['dsr']['DS'][0]['PH'][0]['DM0'][0]['M0']

    r.append(DataPoint(
        region_schema=Schemas.ADMIN_1,
        region_parent='AU',
        region_child='AU-ACT',
        datatype=DataTypes.STATUS_RECOVERED,
        value=self._to_int(recovered),
        date_updated=updated_date,
        source_url=self.source_url,
        source_id=self.SOURCE_ID
    ))
    return r