Python zipContextManager 예제들, fetcher.extras.common.zipContextManager Python 예제들

예제 #1

0

파일 보기

파일: positivity.py 프로젝트: mgithub46/covid19-datafetcher

def handle_ut(res, mapping):
    """Build Utah positivity records from the CSV inside the fetched zip.

    The last item of ``res`` is handed to ``zipContextManager``, whose
    result is used as a directory path.  The first regular file whose name
    starts with the rolling-average prefix is read, its columns are renamed
    through ``mapping``, and two groups of records are produced:

    * ``ut-1``: weekly-window rows carrying ``TIMESTAMP``, ``PPR`` and
      ``UNITS``.
    * ``ut-2``: daily-window rows carrying ``TIMESTAMP``, ``UNITS``,
      ``POSITIVE`` and ``TOTAL`` (``POSITIVE + NEGATIVE``).

    Args:
        res: sequence of fetch results; only ``res[-1]`` is used here.
        mapping: source-column -> output-column mapping.  After renaming,
            the frame must expose ``TIMESTAMP``, ``PPR``, ``POSITIVE`` and
            ``NEGATIVE`` columns.

    Returns:
        A list of record dicts; empty when no matching file is found.
    """
    tagged = []
    prefix = "Overview_Total People Tested Seven-Day Rolling Average Percent Positive Rates by Specimen Collection"
    with zipContextManager(res[-1]) as zipdir:
        with os.scandir(zipdir) as it:
            for entry in it:
                # is_file is a method: the bare attribute is always truthy,
                # so it has to be called for the check to mean anything.
                if not (entry.is_file() and entry.name.startswith(prefix)):
                    continue

                df = pd.read_csv(os.path.join(zipdir, entry.name),
                                 parse_dates=['Collection Date'])
                df = df.rename(columns=mapping)
                df['UNITS'] = 'People'

                # weekly percent-positive rate
                ppr = df.loc[:, ['TIMESTAMP', 'PPR', 'UNITS']]
                ppr['WINDOW'] = 'Week'
                ppr['SID'] = 'ut-1'
                tagged.extend(ppr.to_dict(orient='records'))

                # add the daily values
                totals = df.loc[:, ['TIMESTAMP', 'UNITS', 'POSITIVE']]
                totals['TOTAL'] = df['POSITIVE'] + df['NEGATIVE']
                totals['WINDOW'] = 'Day'
                totals['SID'] = 'ut-2'
                tagged.extend(totals.to_dict(orient='records'))

                # only the first matching file is processed
                break
    return tagged

예제 #2

0

파일 보기

파일: positivity.py 프로젝트: mgithub46/covid19-datafetcher

def handle_ga(res, mapping):
    """Build Georgia PCR positivity records from ``pcr_positives.csv``.

    The last item of ``res`` is handed to ``zipContextManager``, whose
    result is used as a directory path.  Only rows whose ``county`` equals
    ``'Georgia'`` are kept.  Four groups of records are produced, each with
    its own ``SID``:

    * ``ga-1``: one ``Day`` record taken from the row with the latest
      ``report_date``.
    * ``ga-2``: one ``Alltime`` record taken from that same row.
    * ``ga-3``: one ``Week`` record per row (7 day percent positive).
    * ``ga-4``: one ``14Days`` record per row (14 day percent positive).

    Args:
        res: sequence of fetch results; only ``res[-1]`` is used here.
        mapping: source-column -> output-column mapping.  The renamed frame
            must contain a ``PPR`` column, which is dropped and replaced by
            the window-specific rate.

    Returns:
        A flat list of record dicts.
    """
    tagged = []
    filename = "pcr_positives.csv"
    with zipContextManager(res[-1]) as zipdir:
        # Open explicitly so the handle is closed once pandas has read it.
        with open(os.path.join(zipdir, filename), 'r') as csv_file:
            df = pd.read_csv(csv_file, parse_dates=['report_date'])
        df = df[df['county'] == 'Georgia']
        sid = 1

        def get_sid():
            # reads the enclosing ``sid`` at call time
            return "ga-{}".format(sid)

        # alltime/daily both come from the most recent report
        latest = df.sort_values('report_date').iloc[-1]
        # daily
        tagged.append({
            'TOTAL': latest['ALL PCR tests performed'],
            'POSITIVE': latest['All PCR positive tests'],
            'TIMESTAMP': latest['report_date'],
            'WINDOW': 'Day',
            'UNITS': 'Tests',
            'SID': get_sid(),
        })
        # all time
        sid += 1
        tagged.append({
            'TOTAL': latest['Running total of all PCR tests'],
            # the '.1' suffix is how pandas names a repeated header;
            # presumably the second column is the positive running total
            'POSITIVE': latest['Running total of all PCR tests.1'],
            'TIMESTAMP': latest['report_date'],
            'WINDOW': 'Alltime',
            'UNITS': 'Tests',
            'SID': get_sid(),
        })

        # separate it to 7 & 14 rates
        windows = {
            'Week': '7 day percent positive',
            '14Days': '14 day percent positive'
        }
        for window, column in windows.items():
            sid += 1
            pct = df.filter(
                mapping.keys()).rename(columns=mapping).drop(columns='PPR')
            # non-numeric cells become NaN instead of raising
            pct['PPR'] = pd.to_numeric(df[column], errors='coerce')
            pct['WINDOW'] = window
            pct['UNITS'] = 'Tests'
            pct['SID'] = get_sid()
            # extend (not append) so the result stays a flat list of dicts
            tagged.extend(pct.to_dict(orient='records'))

    return tagged

예제 #3

0

파일 보기

파일: backfill.py 프로젝트: COVID19Tracking/covid19-datafetcher

def handle_ga(res, mapping):
    """Collect Georgia backfill records from the CSV files in the zip.

    ``build_leveled_mapping(mapping)`` is treated as a mapping of file name
    to a per-file column mapping.  For each file, the source columns mapped
    to ``'TIMESTAMP'`` are parsed as dates, rows are restricted to
    ``county == 'Georgia'``, columns are renamed, and the per-file
    ``DATE_USED`` value is attached to every row.

    Args:
        res: sequence of fetch results; ``res[0]`` is passed to
            ``zipContextManager``.
        mapping: raw mapping handed to ``build_leveled_mapping``.

    Returns:
        A list of record dicts, one per remaining row across all files.
    """
    records = []
    per_file = build_leveled_mapping(mapping)
    with zipContextManager(res[0]) as zipdir:
        for csv_name, column_map in per_file.items():
            # source columns that end up as TIMESTAMP must be parsed as dates
            timestamp_columns = []
            for source_column, target in column_map.items():
                if target == 'TIMESTAMP':
                    timestamp_columns.append(source_column)

            frame = pd.read_csv(os.path.join(zipdir, csv_name),
                                parse_dates=timestamp_columns)
            statewide = frame[frame['county'] == 'Georgia']

            # DATE_USED is metadata, not a column rename: remove it from
            # the mapping before renaming, then stamp it on every row
            date_used = column_map.pop(DATE_USED)
            statewide = statewide.rename(columns=column_map)
            statewide[DATE_USED] = date_used

            records.extend(statewide.to_dict(orient='records'))
    return records

예제 #4

0

파일 보기

파일: states.py 프로젝트: griffindvs/covid19-datafetcher

def handle_ga(res, mapping):
    """Merge Georgia ArcGIS results with the last rows of two zipped CSVs.

    Every item of ``res`` except the last is run through
    ``extract_arcgis_attributes`` and merged into one dict.  The value
    stored under ``'CURR_HOSP_PUI'`` is then removed and added to the
    ``Fields.CURR_HOSP`` entry.  The last item of ``res`` is handed to
    ``zipContextManager``; the final row of ``total_testing.csv`` and of
    ``summary_totals.csv`` is mapped with ``map_attributes`` and merged in.

    Args:
        res: sequence of fetch results; the last item is the zip source.
        mapping: mapping passed through to the attribute helpers.

    Returns:
        A dict of tagged values.

    Raises:
        KeyError: if ``'CURR_HOSP_PUI'`` or the ``Fields.CURR_HOSP`` entry
            is missing after the ArcGIS step.
        IndexError: if one of the CSV files has no data rows.
    """
    tagged = {}
    for result in res[:-1]:
        partial = extract_arcgis_attributes(result, mapping, debug_state='GA')
        tagged.update(partial)
    tagged[Fields.CURR_HOSP.name] += tagged.pop('CURR_HOSP_PUI')

    # last item is zip
    files = ["total_testing.csv", "summary_totals.csv"]
    with zipContextManager(res[-1]) as zipdir:
        for filename in files:
            # Context manager closes each file; newline='' is what the csv
            # module asks for when it is given a file object.
            with open(os.path.join(zipdir, filename), 'r',
                      newline='') as csv_file:
                rows = list(csv.DictReader(csv_file))
            # only the most recent (last) row is relevant
            summary = rows[-1]
            partial = map_attributes(summary, mapping, 'GA')
            tagged.update(partial)

    return tagged

예제 #5

0

파일 보기

파일: backfill.py 프로젝트: COVID19Tracking/covid19-datafetcher

def handle_ut(res, mapping):
    """Collect Utah backfill records from the CSV files in the zip.

    ``build_leveled_mapping(mapping)`` is treated as a mapping of file-name
    prefix to a per-file column mapping.  Each directory entry whose name
    starts with a known prefix is read, indexed by the ``DATE`` column, and
    converted to cumulative values unless one of its original column names
    already contains ``'umulative'`` after the first character.

    Args:
        res: sequence of fetch results; ``res[-1]`` is passed to
            ``zipContextManager``.
        mapping: raw mapping handed to ``build_leveled_mapping``.

    Returns:
        A list of record dicts, each carrying ``TS`` and ``DATE_USED``.
    """
    archive = res[-1]
    records = []
    prefix_mappings = build_leveled_mapping(mapping)

    with zipContextManager(archive) as zipdir, os.scandir(zipdir) as entries:
        for entry in entries:
            # first prefix (in mapping order) that the file name starts with
            columns_map = next(
                (candidate for prefix, candidate in prefix_mappings.items()
                 if entry.name.startswith(prefix)),
                None)
            if not columns_map:
                continue

            frame = pd.read_csv(os.path.join(zipdir, entry.name))
            # must be decided on the raw headers, before any renaming
            already_cumulative = any(
                header.find('umulative') > 0 for header in frame.columns)
            frame = frame.rename(columns=columns_map).set_index(DATE)
            frame.index = pd.to_datetime(frame.index)

            # 1. Testing files are long-format: widen them to one column
            #    per "<Test Type>-<Result>" pair, then map those names
            if 'Test Type' in frame.columns:
                frame = frame.pivot(columns=['Test Type', 'Result'],
                                    values='Count')
                frame.columns = frame.columns.map("-".join)
                frame = frame.rename(columns=columns_map).sort_index()

                # several source pairs may map to one name: add them up
                frame = frame.groupby(frame.columns.values, axis=1).sum()

            # 2. Daily series are turned into running totals
            if not already_cumulative:
                frame = frame.sort_index().cumsum()

            # 3. Timestamp + DATE_USED bookkeeping columns
            frame[TS] = frame.index
            frame[DATE_USED] = columns_map[DATE_USED]

            records.extend(frame.to_dict(orient='records'))

    return records

예제 #6

0

파일 보기

파일: backfill.py 프로젝트: hmhoffman/covid19-datafetcher

def handle_ga(res, mapping):
    """Collect Georgia backfill records from the CSV files in the zip.

    ``build_leveled_mapping(mapping)`` is treated as a mapping of file name
    to a per-file column mapping.  For each file, the source columns mapped
    to ``'TIMESTAMP'`` are parsed as dates, rows are restricted to
    ``county == 'Georgia'``, columns are renamed, and the per-file
    ``DATE_USED`` value is attached to every row.  Files whose name starts
    with ``pcr_positive`` get a fixed set of eight column names first.

    Args:
        res: sequence of fetch results; ``res[0]`` is passed to
            ``zipContextManager``.
        mapping: raw mapping handed to ``build_leveled_mapping``.

    Returns:
        A list of record dicts, one per remaining row across all files.
    """
    records = []
    per_file = build_leveled_mapping(mapping)
    with zipContextManager(res[0]) as zipdir:
        for csv_name, column_map in per_file.items():
            timestamp_columns = []
            for source_column, target in column_map.items():
                if target == 'TIMESTAMP':
                    timestamp_columns.append(source_column)

            frame = pd.read_csv(os.path.join(zipdir, csv_name),
                                parse_dates=timestamp_columns)

            # The PCR file repeats header names, so its columns are
            # replaced positionally; '_' marks columns that are not used.
            if csv_name.startswith('pcr_positive'):
                frame.columns = [
                    'county', 'TIMESTAMP', '_', 'SPECIMENS', '_',
                    'SPECIMENS_POS', '_', '_'
                ]

            statewide = frame[frame['county'] == 'Georgia']

            # DATE_USED is metadata, not a column rename: remove it from
            # the mapping before renaming, then stamp it on every row
            date_used = column_map.pop(DATE_USED)
            statewide = statewide.rename(columns=column_map)
            statewide[DATE_USED] = date_used

            records.extend(statewide.to_dict(orient='records'))
    return records

예제 #7

0

파일 보기

파일: states.py 프로젝트: griffindvs/covid19-datafetcher

def handle_ut(res, mapping):
    """Assemble Utah values from ArcGIS, two parsed pages and a zip of CSVs.

    Sources, by position in ``res``:

    * ``res[0]``: passed to ``extract_arcgis_attributes``.
    * ``res[1]``: parsed page searched by element id for every key of
      ``mapping`` (presumably a BeautifulSoup document -- it is only used
      through ``find``).
    * ``res[2]``: parsed page whose first table holds the current
      hospitalization / ICU figures and whose remaining tables are matched
      to ``mapping`` by caption.
    * ``res[-1]``: passed to ``zipContextManager``; the resulting directory
      is scanned for the specimens, people-tested and recovered CSV files.

    Args:
        res: sequence of fetch results laid out as described above.
        mapping: source-name -> output-tag mapping.

    Returns:
        A dict of output tag -> value.
    """
    tagged = {}
    soup_start = 1
    for result in res[:soup_start]:
        partial = extract_arcgis_attributes(result, mapping, 'UT')
        tagged.update(partial)

    stats = res[1]
    for k, v in mapping.items():
        x = stats.find(id=k)
        if x:
            # the number sits in a 'value' element, or failing that in a
            # 'value-output' element; skip the id when neither exists
            value_item = x.find(class_='value')
            if not value_item:
                value_item = x.find(class_='value-output')
            if not value_item:
                continue
            value = atoi(value_item.get_text(strip=True))
            tagged[v] = value

    # inverse mapping
    revmap = {v: k for k, v in mapping.items()}
    hosp = res[2]
    tables = hosp.find_all('table')

    curr_hosp_table = tables[0]
    tds = curr_hosp_table.find_all('td',
                                   string=re.compile(
                                       revmap[Fields.CURR_HOSP.name]))
    # every matching label cell contributes the sum of its sibling cells
    curr_hosp = 0
    for td in tds:
        for x in td.next_siblings:
            if x.name == 'td':
                curr_hosp += atoi(x.get_text(strip=True))
    tagged[Fields.CURR_HOSP.name] = curr_hosp

    # TODO: code here can be improved, combined with top part
    td = curr_hosp_table.find('td',
                              string=re.compile(revmap[Fields.CURR_ICU.name]))
    # when several sibling cells exist, the last one wins
    for x in td.next_siblings:
        if x.name == 'td':
            val = atoi(x.get_text(strip=True))
            tagged[Fields.CURR_ICU.name] = val

    for t in tables[1:]:
        if t.caption.get_text(strip=True) in mapping:
            # the value is taken from the second cell of the table
            td = t.find_all('td', limit=2)[1]
            tagged[mapping[t.caption.get_text(strip=True)]] = atoi(
                td.get_text(strip=True))

    # Downloadable file
    zipurl = res[-1]
    # Sometimes there are files for multiple dates, we need the most recent.
    # Each "*_latest" starts as the bare prefix and tracks the greatest file
    # name seen so far; names that compare lower are skipped.
    specimens_file_prefix = 'Overview_Total Tests by'
    specimens_file_latest = specimens_file_prefix
    recovered_file = 'Overview_Cumulative COVID-19 Cases'
    recovered_file_latest = recovered_file
    people_tested_file = 'Overview_Number of People Tested by'
    people_tested_latest = people_tested_file
    test_type = ['PCR/amplification', 'Antigen by DFA/IF']
    result = ['POSITIVE', 'NEGATIVE']
    with zipContextManager(zipurl) as zipdir:
        with os.scandir(zipdir) as it:
            for entry in it:
                fields = []
                # is_file is a method: without the call the test is always
                # False and directories would never be skipped
                if not entry.is_file():
                    # just in case
                    continue
                if entry.name.startswith(specimens_file_prefix):
                    if entry.name < specimens_file_latest:
                        continue
                    # specimens
                    fields = [
                        Fields.SPECIMENS_POS, Fields.SPECIMENS_NEG,
                        Fields.ANTIGEN_POS, Fields.ANTIGEN_NEG
                    ]
                    specimens_file_latest = entry.name
                elif entry.name.startswith(people_tested_file):
                    if entry.name < people_tested_latest:
                        continue
                    # people tested
                    fields = [
                        Fields.CONFIRMED, Fields.NEGATIVE,
                        Fields.ANTIGEN_POS_PEOPLE, Fields.ANTIGEN_NEG_PEOPLE,
                        Fields.TOTAL, Fields.ANTIGEN_TOTAL_PEOPLE
                    ]
                    people_tested_latest = entry.name
                elif entry.name.startswith(recovered_file):
                    if entry.name < recovered_file_latest:
                        continue
                    # recoveries
                    fields = [Fields.RECOVERED]
                    recovered_file_latest = entry.name
                if fields and entry.name.startswith(recovered_file):
                    df = pd.read_csv(os.path.join(zipdir, entry.name))
                    # last row of the recovered column is the reported value
                    last = df['Estimated Recovered *'].iloc[-1]
                    if Fields.RECOVERED in fields:
                        tagged[Fields.RECOVERED.name] = last
                elif fields and not entry.name.startswith(recovered_file):
                    df = pd.read_csv(os.path.join(zipdir, entry.name))
                    summed = df.groupby(['Test Type', 'Result']).sum()
                    # the first four entries of ``fields`` line up with the
                    # (test type, result) pairs in nested-loop order
                    i = 0
                    for tt in test_type:
                        for rr in result:
                            tag = fields[i]
                            tag = tag if isinstance(tag, str) else tag.name
                            value = summed.loc[tt, rr]['Count']
                            tagged[tag] = value
                            i += 1
                    # handle totals
                    if Fields.CONFIRMED in fields:
                        # total = positive + negative of the first test type
                        tagged[Fields.TOTAL.name] = sum([
                            summed.loc[test_type[0], rr]['Count']
                            for rr in result
                        ])
    return tagged