def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> List[str]:
    # The link to the spreadsheet changes daily, so we parse the HTML to find the link every
    # time and download the latest version
    buffer = BytesIO()
    src_opts = fetch_opts[0]
    download(src_opts["url"], buffer)
    page = BeautifulSoup(buffer.getvalue().decode("utf8"), "lxml")
    for link in page.find_all("a"):
        href = link.attrs.get("href", "")
        if href.endswith("xlsx"):
            # Relative links are resolved against the scheme and host of the source URL
            if href.startswith("/"):
                href = "https://" + src_opts["url"].split("//")[1].split("/")[0] + href
            return [download_snapshot(href, output_folder, **src_opts.get("opts", {}))]
    raise RuntimeError("No link to XLSX file found in page")
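
# The relative-link handling above rebuilds the scheme and host by hand. An
# alternative sketch (not what this snippet uses) with the standard library's
# urllib.parse.urljoin, which resolves root-relative hrefs against the page URL:
from urllib.parse import urljoin

def resolve_href(page_url: str, href: str) -> str:
    # e.g. resolve_href("https://example.com/reports/latest", "/files/data.xlsx")
    #      -> "https://example.com/files/data.xlsx" (hypothetical URLs)
    return urljoin(page_url, href)
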
def _pull_source(cache_source: Dict[str, str]):
    # Note: `output_folder` is expected to be defined in the enclosing scope
    url = cache_source.pop("url")
    output = cache_source.pop("output")
    buffer = BytesIO()
    try:
        download(url, buffer)
        with (output_folder / output).open("wb") as fd:
            fd.write(buffer.getvalue())
    except Exception:
        print(f"Cache pull failed for {url}")
        traceback.print_exc()
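
# A minimal usage sketch for _pull_source (assumed, not taken from the source
# project): map it over a list of {url, output} entries with a small thread
# pool, since the work is I/O-bound. The URLs and file names are hypothetical,
# and `output_folder` is assumed to already be defined in the enclosing scope.
from concurrent.futures import ThreadPoolExecutor

def pull_all_sources(cache_sources):
    # cache_sources: e.g. [{"url": "https://example.com/a.csv", "output": "a.csv"}, ...]
    with ThreadPoolExecutor(max_workers=4) as executor:
        # list() drains the iterator so every download actually runs
        return list(executor.map(_pull_source, cache_sources))
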
Example #3
def _pull_source(cache_source: Dict[str, str]):
    url = cache_source.pop("url")
    output = cache_source.pop("output")
    logger.log_info(f"Downloading {url} into {output}")
    buffer = BytesIO()
    try:
        download(url, buffer)
        with (output_folder / output).open("wb") as fd:
            fd.write(buffer.getvalue())
        logger.log_info(f"Downloaded {output} successfully")
    except Exception:
        logger.log_error(f"Cache pull failed for {url}.", traceback=traceback.format_exc())
Example #4
def main(output_folder: Path,
         only: List[str] = None,
         exclude: List[str] = None):

    # Perform a dry-run to update the data using the current configuration
    update_data(output_folder, only=only, exclude=exclude)

    # Download all the tables from the prod server to local storage
    output_tables = list((output_folder / "tables").glob("*.csv"))
    tables_summary = {table_path.stem: {} for table_path in output_tables}
    for table_path in output_tables:
        table_name = table_path.stem
        table_path_str = str(table_path)
        tables_summary[table_name]["local_curr"] = table_path_str
        local_prod = Path(table_path_str.replace(".csv", ".prod.csv"))
        with local_prod.open(mode="wb") as fd:
            try:
                download(f"{URL_OUTPUTS_PROD}/{table_path.name}", fd)
                tables_summary[table_name]["local_prod"] = str(local_prod)
            except Exception:
                tables_summary[table_name]["local_prod"] = None

    # Compare the new vs prod data
    for table_name, table_data in tables_summary.items():

        # Read both tables into memory
        curr_df = read_file(table_data["local_curr"])
        if table_data["local_prod"] is None:
            prod_df = DataFrame(columns=curr_df.columns)
        else:
            prod_df = read_file(table_data["local_prod"])

        # Compare the number of records
        table_data["records"] = f"{len(curr_df) - len(prod_df):+d}"

        # Compare the columns
        table_data["columns"] = compare_sets(set(curr_df.columns),
                                             set(prod_df.columns))

        # Compare the keys
        if "key" in curr_df.columns:
            table_data["keys"] = compare_sets(set(curr_df.key.unique()),
                                              set(prod_df.key.unique()))

        # Compare the dates
        if "date" in curr_df.columns:
            table_data["dates"] = compare_sets(set(curr_df.date.unique()),
                                               set(prod_df.date.unique()))

    # Return the summary of changes
    return tables_summary
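
# `compare_sets` is not defined in this listing. A hypothetical stand-in (an
# assumption, not the project's actual helper) that summarizes how many values
# were added and removed between the current and prod tables:
def compare_sets(curr: set, prod: set) -> str:
    added, removed = curr - prod, prod - curr
    return f"+{len(added)} added, -{len(removed)} removed"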
Example #5
    def parse(self, sources: List[str], aux: Dict[str, DataFrame],
              **parse_opts):

        buffer = BytesIO()
        download(sources[0], buffer, progress=True)

        data = None
        with zipfile.ZipFile(buffer) as zipped:
            data = zipped.read("WDIData.csv")
            data = read_csv(BytesIO(data))
        assert data is not None

        data = data.rename(
            columns={
                "Country Code": "3166-1-alpha-3",
                "Indicator Name": "indicator_name",
                "Indicator Code": "indicator_code",
            })

        data = data.merge(aux["worldbank_indicators"]).merge(
            aux["country_codes"])
        data = data.drop(
            columns=["Country Name", "3166-1-alpha-2", "3166-1-alpha-3"])

        indicators = parse_opts.get(
            "indicators", {code: code for code in data.indicator_code.values}
        )
        min_year = int(parse_opts.get("min_year", 2015))
        data = data[data.indicator_code.isin(indicators.values())]

        # Index data by indicator code for performance optimization
        keys = data.key.unique()
        indexed = {
            key: data[data.key == key].set_index("indicator_code")
            for key in keys
        }

        # There is probably a fancy pandas function to do this more efficiently, but this works for now
        map_func = partial(WorldbankDataSource._process_record, indexed,
                           indicators, min_year)
        records = thread_map(map_func, keys, desc="WorldBank Indicators")

        # Some countries are better described as subregions
        data = DataFrame.from_records(records)
        data.loc[data.key == "MF", "key"] = "FR_MF"

        # Return all records in DataFrame form
        return data
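
# `thread_map` above is a thread-pooled map with a progress bar; assuming it is
# tqdm.contrib.concurrent.thread_map, the partial() + thread_map pattern used
# above boils down to this (the worker here is hypothetical, standing in for
# WorldbankDataSource._process_record):
from functools import partial
from tqdm.contrib.concurrent import thread_map

def _scale(factor: int, value: int) -> int:
    return factor * value

scaled = thread_map(partial(_scale, 2), range(100), desc="example")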
Example #6
    v2_folder.mkdir(exist_ok=True, parents=True)

    # Download the v2 tables which can fit under 100MB
    for table_name in pbar(
        (
            "by-age",
            "by-sex",
            "demographics",
            "economy",
            "epidemiology",
            "geography",
            "health",
            "hospitalizations",
            "index",
            "mobility",
            "oxford-government-response",
            "weather",
            "worldbank",
            "worldpop",
        ),
            desc="V2 download",
    ):
        for ext in ("csv", "json"):
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = Path(tmp.name)
                download(f"{URL_OUTPUTS_PROD}/{table_name}.{ext}", tmp)
                # Check that the output is less than 100 MB before copying it to the output folder
                if tmp_path.stat().st_size < 100 * 1000 * 1000:
                    shutil.copyfile(tmp_path, v2_folder / f"{table_name}.{ext}")
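
# Note (an assumption about `download`): if it writes through the open file
# object rather than reopening the path, buffered bytes may not be on disk yet
# when the size check runs. A cautious variant would flush first:
#
#     download(f"{URL_OUTPUTS_PROD}/{table_name}.{ext}", tmp)
#     tmp.flush()
#     if tmp_path.stat().st_size < 100 * 1000 * 1000:
#         shutil.copyfile(tmp_path, v2_folder / f"{table_name}.{ext}")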
Example #7
def perform_update(suite, paths):
    """
    Performs an incremental update and merge of a given suite
    """
    info('Checking for updates in %s' % suite)
    # print(paths)
    globalvars.suite = suite

    needsmerge = {}
    needsmerge['downloads'] = []  # all files that have to be downloaded
    regenrelease = False
    c = 0
    for i in repo_order:
        # i = repository name
        needsmerge[i] = {}
        needsmerge[i]['mergelist'] = []

        if paths[c]:
            info('Working on %s repo' % i)
            remote_path = paths[c].replace(spooldir, repos[i]['host'])
            try:
                remote_rel = requests.get(join(remote_path, 'Release'))
            except requests.exceptions.ConnectionError as err:
                warn('Caught exception: "%s". Retrying...' % err)
                return perform_update(suite, paths)

            with open(join(paths[c], 'Release')) as local_rel_file:
                local_rel_text = local_rel_file.read()

            diffs = {}
            if remote_is_newer(remote_rel.text, local_rel_text):
                download((join(remote_path,
                               'Release'), join(paths[c], 'Release')))
                regenrelease = True

                diffs = compare_dict(parse_release(remote_rel.text),
                                     parse_release(local_rel_text))
            if diffs:
                for k in diffs:
                    if k.endswith('Packages.gz') or k.endswith('Sources.gz'):
                        needsmerge[i]['mergelist'].append(k)
                    rmt = join(paths[c].replace(spooldir, repos[i]['host']), k)
                    loc = join(paths[c], k)
                    dlf = (rmt, loc)
                    needsmerge['downloads'].append(dlf)

        c += 1
        # break

    # download what needs to be downloaded
    if needsmerge['downloads']:
        info('Downloading updates...')
        dlpool = Pool(cpunm)
        dlpool.map(download, needsmerge['downloads'])

    # create union of our Packages.gz and Sources.gz files we will merge
    uni = []
    for i in repo_order:
        uni.append(needsmerge[i]['mergelist'])
    updpkg_list = set().union(*uni)

    # make a list of package lists to feed into merge()
    merge_list = []
    for i in updpkg_list:
        pkgs = []
        for j in repo_order:
            sui = suite
            # append valid aliases
            if repos[j]['aliases']:
                if suite in aliases[repos[j]['name']]:
                    sui = aliases[repos[j]['name']][suite]
                elif repos[j]['skipmissing']:
                    sui = None
                skips = ['jessie-security', 'ascii-security']  # hack
                if j == 'debian' and suite in skips:
                    sui = None

            if sui:
                pkgs.append(join(spooldir, repos[j]['dists'], sui, i))
            else:
                pkgs.append(None)

        merge_list.append(pkgs)

    # perform the actual merge
    if merge_list:
        info('Merging files...')
        mrgpool = Pool(cpunm)
        mrgpool.map(merge, merge_list)

    # generate Release files if we got any new files
    if needsmerge['downloads'] or regenrelease:
        info('Generating Release...')
        gen_release(suite)
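
# `parse_release` and `compare_dict` are not shown in this listing. A hypothetical
# sketch (an assumption, not this project's implementation): parse_release maps
# index file paths to their checksums, and compare_dict keeps the entries whose
# upstream checksum is new or changed, which is what drives the download list.
def compare_dict_sketch(remote: dict, local: dict) -> dict:
    return {path: checksum for path, checksum in remote.items() if local.get(path) != checksum}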
Example #8
    def parse_dataframes(self, dataframes: List[DataFrame],
                         aux: Dict[str, DataFrame], **parse_opts):

        # Get all the weather stations with data up until last month from inventory
        today = datetime.date.today()
        min_date = (today - datetime.timedelta(days=30)).strftime("%Y%m%d")
        stations = read_csv(_INVENTORY_URL).rename(columns={
            "LAT": "lat",
            "LON": "lon",
            "ELEV(M)": "elevation"
        })
        stations = stations[stations.END > int(min_date)]
        stations["id"] = stations["USAF"] + stations["WBAN"].apply(
            lambda x: f"{x:05d}")

        # Download all the station data as a compressed file
        buffer = BytesIO()
        records_url = "https://www.ncei.noaa.gov/data/global-summary-of-the-day/archive/2020.tar.gz"
        download(records_url, buffer, progress=True)
        buffer.seek(0)
        with tarfile.open(fileobj=buffer, mode="r:gz") as stations_tar:

            # Build the station cache by uncompressing all files in memory
            station_cache = {}
            for member in tqdm(stations_tar.getmembers(),
                               desc="Decompressing"):

                if not member.name.endswith(".csv"):
                    continue

                # Read the records from the provided station
                data = read_csv(stations_tar.extractfile(member)).rename(
                    columns=_COLUMN_MAPPING)

                # Fix data types
                data.noaa_station = data.noaa_station.astype(str)
                data.rainfall = data.rainfall.apply(
                    NoaaGsodDataSource.conv_dist)
                data.snowfall = data.snowfall.apply(
                    NoaaGsodDataSource.conv_dist)
                for temp_type in ("average", "minimum", "maximum"):
                    col = f"{temp_type}_temperature"
                    data[col] = data[col].apply(NoaaGsodDataSource.conv_temp)

                station_cache[member.name.replace(".csv", "")] = data

        # Get all the POI from metadata and go through each key
        metadata = dataframes[0][["key", "latitude", "longitude"]].dropna()

        # Convert all coordinates to radians
        stations["lat"] = stations.lat.apply(math.radians)
        stations["lon"] = stations.lon.apply(math.radians)
        metadata["lat"] = metadata.latitude.apply(math.radians)
        metadata["lon"] = metadata.longitude.apply(math.radians)

        # Make sure the stations and the cache are sent to each function call
        map_func = partial(NoaaGsodDataSource.process_location, station_cache,
                           stations)

        # We don't care about the index while iterating over each metadata item
        map_iter = [record for _, record in metadata.iterrows()]

        # Shuffle the iterables to try to make better use of the caching
        shuffle(map_iter)

        # Bottleneck is network so we can use lots of threads in parallel
        records = concurrent.thread_map(map_func,
                                        map_iter,
                                        total=len(metadata))

        return concat(records)
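
# The station and metadata coordinates are converted to radians above, presumably
# so that `process_location` can rank stations by great-circle distance. A minimal
# haversine sketch under that assumption (not the project's code):
import math

def haversine_km(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Great-circle distance in kilometers between two points given in radians."""
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    return 2 * 6371 * math.asin(math.sqrt(a))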
Example #9
def main():

    # Create the folder which will be published
    public_folder = SRC / ".." / "output" / "public"
    public_folder.mkdir(exist_ok=True, parents=True)

    # Create the v1 data.csv file
    main_table = read_file(f"{URL_OUTPUTS_PROD}/main.csv", low_memory=False)
    data = main_table[main_table.aggregation_level < 2]
    rename_columns = {
        "date": "Date",
        "key": "Key",
        "country_code": "CountryCode",
        "country_name": "CountryName",
        "subregion1_code": "RegionCode",
        "subregion1_name": "RegionName",
        "total_confirmed": "Confirmed",
        "total_deceased": "Deaths",
        "latitude": "Latitude",
        "longitude": "Longitude",
        "population": "Population",
    }
    data = data[rename_columns.keys()].rename(columns=rename_columns)
    data = data.dropna(subset=["Confirmed", "Deaths"], how="all")
    data = data.sort_values(["Date", "Key"])
    export_csv(data, public_folder / "data.csv")

    # Create the v1 data_minimal.csv file
    export_csv(data[["Date", "Key", "Confirmed", "Deaths"]], public_folder / "data_minimal.csv")

    # Create the v1 data_latest.csv file
    latest = main_table[main_table.aggregation_level < 2]
    latest = latest.sort_values("date").groupby("key").last().reset_index()
    latest = latest[rename_columns.keys()].rename(columns=rename_columns)
    latest = latest.dropna(subset=["Confirmed", "Deaths"], how="all")
    latest = latest.sort_values(["Key", "Date"])
    export_csv(latest, public_folder / "data_latest.csv")

    # Create the v1 weather.csv file
    weather = read_file(f"{URL_OUTPUTS_PROD}/weather.csv")
    weather = weather[weather.key.apply(lambda x: len(x.split("_")) < 3)]
    weather = weather.rename(columns={"noaa_distance": "distance", "noaa_station": "station"})
    rename_columns = {col: snake_to_camel_case(col) for col in weather.columns}
    export_csv(weather.rename(columns=rename_columns), public_folder / "weather.csv")

    # Create the v1 mobility.csv file
    mobility = read_file(f"{URL_OUTPUTS_PROD}/mobility.csv")
    mobility = mobility[mobility.key.apply(lambda x: len(x.split("_")) < 3)]
    mobility = drop_na_records(mobility, ["date", "key"])
    rename_columns = {
        col: snake_to_camel_case(col).replace("Mobility", "") for col in mobility.columns
    }
    export_csv(mobility.rename(columns=rename_columns), public_folder / "mobility.csv")

    # Create the v1 CSV files which only require simple column mapping
    v1_v2_name_map = {"response": "oxford-government-response"}
    for v1_name, v2_name in v1_v2_name_map.items():
        data = read_file(f"{URL_OUTPUTS_PROD}/{v2_name}.csv")
        rename_columns = {col: snake_to_camel_case(col) for col in data.columns}
        export_csv(data.rename(columns=rename_columns), public_folder / f"{v1_name}.csv")

    # Create the v1 forecast.csv file
    export_csv(
        build_forecast(read_file(public_folder / "data_minimal.csv")),
        public_folder / "data_forecast.csv",
    )

    # Convert all v1 CSV files to JSON using record format
    for csv_file in pbar([*(public_folder).glob("*.csv")], desc="V1 JSON conversion"):
        data = read_file(csv_file, low_memory=False)
        json_path = str(csv_file.with_suffix(".json"))
        data.to_json(json_path, orient="records")

    # Create the v2 folder
    v2_folder = public_folder / "v2"
    v2_folder.mkdir(exist_ok=True, parents=True)

    # Download the v2 tables which can fit under 100MB
    for table_name in pbar(
        (
            "by-age",
            "by-sex",
            "demographics",
            "economy",
            "epidemiology",
            "geography",
            "health",
            "hospitalizations",
            "index",
            "mobility",
            "oxford-government-response",
            "weather",
            "worldbank",
            "worldpop",
        ),
        desc="V2 download",
    ):
        for ext in ("csv", "json"):
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = Path(tmp.name)
                download(f"{URL_OUTPUTS_PROD}/{table_name}.{ext}", tmp)
                # Check that the output is less than 100 MB before copying it to the output folder
                if tmp_path.stat().st_size < 100 * 1000 * 1000:
                    shutil.copyfile(tmp_path, v2_folder / f"{table_name}.{ext}")
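
# `snake_to_camel_case` is used throughout but not defined in this listing. A
# hypothetical implementation (an assumption, not necessarily the project's):
def snake_to_camel_case(txt: str) -> str:
    """Convert a name such as "total_confirmed" into "TotalConfirmed"."""
    return "".join(word.capitalize() for word in txt.split("_"))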