Example #1
import boto3
import osm_road_length
import pandas as pd

import awswrangler as wr
from shapely import wkt

# `get_data_from_athena` is a project helper used throughout these examples:
# it runs a query against Athena and returns a pandas DataFrame.


def metadata_osm_length(config):

    # Regions registered for this run.
    metadata = get_data_from_athena(
        "select * from "
        f"{config['athena_database']}.{config['slug']}_metadata_regions_metadata ",
        config,
    )

    # Regions explicitly flagged to be re-processed.
    rerun = metadata[metadata["rerun"] == "TRUE"]

    if config["mode"] == "overwrite_partitions":

        # Skip regions whose OSM length is already computed; if the target
        # table does not exist yet, nothing is skipped.
        try:
            skip = get_data_from_athena(
                "select distinct region_shapefile_wkt from "
                f"{config['athena_database']}.{config['slug']}_metadata_metadata_osm_length "
                "where osm_length is not null",
                config,
            )
        except Exception:
            skip = pd.DataFrame([], columns=["region_shapefile_wkt"])

        metadata = metadata[
            ~metadata["region_shapefile_wkt"].isin(skip["region_shapefile_wkt"])
        ]

    def _get_length(x):
        # Total road length inside a WKT polygon, restricted to accepted OSM keys.
        lengths = osm_road_length.get(wkt.loads(x))
        lengths = lengths[lengths.index.isin(config["accepted_osm_keys"])]
        return lengths["length"].sum()

    # Re-add flagged regions so they are always recomputed.
    metadata = pd.concat([metadata, rerun]).drop_duplicates()

    if config["verbose"]:
        print(list(metadata["region_slug"]))

    metadata["osm_length"] = metadata["region_shapefile_wkt"].apply(_get_length)

    if len(metadata):

        # Write a partitioned parquet dataset and register it in Athena.
        wr.s3.to_parquet(
            df=metadata,
            path="s3://{bucket}/{prefix}/{slug}/{raw_table}/{name}".format(**config),
            dataset=True,
            database=config["athena_database"],
            table="{slug}_{raw_table}_{name}".format(**config),
            mode=config["mode"],
            partition_cols=["region_slug"],
            boto3_session=boto3.Session(region_name="us-east-1"),
        )
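
For reference, a minimal sketch of the config this function reads; the keys are the ones used above, the values are purely illustrative:

config = {
    "athena_database": "mydb",                # illustrative
    "slug": "dev",
    "raw_table": "metadata",
    "name": "metadata_osm_length",
    "mode": "overwrite_partitions",           # passed through to wr.s3.to_parquet
    "accepted_osm_keys": ["motorway", "trunk", "primary"],
    "verbose": True,
    "bucket": "my-bucket",                    # illustrative
    "prefix": "my-prefix",
}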
Example #2
def check_existence(config):

    # A non-empty result from `show tables ... '<pattern>'` means the table exists.
    res = get_data_from_athena(
        f"show tables in {config['athena_database']} "
        f"'{config['slug']}_{config['raw_table']}_{config['name']}'"
    )

    return len(res) > 0
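
A usage sketch (hypothetical driver, not from the source) showing the check picking the write mode before a run; "overwrite" and "overwrite_partitions" are the modes wr.s3.to_parquet accepts:

if check_existence(config):
    config["mode"] = "overwrite_partitions"  # table exists: refresh partitions only
else:
    config["mode"] = "overwrite"             # first run: create the dataset and table
metadata_osm_length(config)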
Example #3
def coarse(config):

    # Distinct (region_slug, group) pairs at grid resolution 7.
    insert_groups = get_data_from_athena(
        'select distinct region_slug, "group" from '
        f"{config['athena_database']}.{config['slug']}_grid_resolutions "
        "where resolution = 7",
        config,
    ).to_dict("records")

    insert_into.start(config, insert_groups)
Example #4
def _region_slug_partition(config):

    # One record per region; the region slug doubles as the partition key.
    data = get_data_from_athena(
        "select * from "
        f"{config['athena_database']}.{config['slug']}_metadata_metadata_ready "
    ).to_dict("records")

    for d in data:
        d["partition"] = d["region_slug"]

    return data
Example #5
def resolutions(config):

    metadata = get_data_from_athena(
        "select region_slug, region_shapefile_wkt from "
        f"{config['athena_database']}.{config['slug']}_metadata_metadata_prepare "
        "where grid = 'TRUE' "
        f"""or region_slug in ('{"','".join(config['selected_regions'])}')""",
        config,
    )

    metadata["wkt"] = metadata["region_shapefile_wkt"].apply(_reescale)
    # metadata["geojson"] = metadata["wkt_reescaled"].apply(_wkt_to_geojson)

    grid = (
        metadata.groupby("region_slug")["wkt"].apply(
            lambda x: get_cells(x, config))  # Get h3 ids and wkts
        .reset_index())

    create_table.from_local(grid, config, wrangler=True)
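
`get_cells` is a project helper whose body is not shown; as a rough sketch of the idea (assuming the h3-py v3 API and one WKT per region), filling a polygon with H3 cells could look like this — the name and signature here are assumptions:

import h3
from shapely import wkt
from shapely.geometry import mapping

def get_cells_sketch(wkt_str, resolution=7):
    # Fill the polygon with H3 cell ids at the given resolution.
    geojson = mapping(wkt.loads(wkt_str))
    return list(h3.polyfill(geojson, resolution, geo_json_conformant=True))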
Example #6
def write_index(config):

    for table in config["to_write"]:

        df = get_data_from_athena(
            "select * from "
            f"{config['athena_database']}.{config['slug']}_{table['table']}"
        )

        if "region_shapefile_wkt" in df.columns:
            df["region_shapefile_wkt"] = df["region_shapefile_wkt"].apply(
                lambda x: str(simplify(wkt.loads(x)))
            )

        if table.get("overall_drop"):
            df = df.drop(columns=table["overall_drop"])

        # Google Drive sheet ids, keyed by pipeline name and slug.
        with open("configs/drive-config.yaml", "r") as f:
            drive_config = yaml.safe_load(f)

        if config["slug"] == "dev":
            _write_sheets_table(
                df,
                table["worksheet"],
                config,
                drive_config[config["name"]][config["slug"]],
            )

        elif config["slug"] == "prod":

            _write_sheets_table(
                df,
                table["worksheet"],
                config,
                drive_config[config["name"]][config["slug"]],
            )

            # The public sheet gets the same data minus restricted columns.
            df = df.drop(columns=table["public_drop"])
            _write_sheets_table(
                df, table["worksheet"], config, drive_config[config["name"]]["public"]
            )
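
The lookups above imply that configs/drive-config.yaml maps a pipeline name to one spreadsheet target per slug, plus a "public" target; a sketch of the equivalent structure (names and ids are hypothetical):

drive_config = {
    "index": {                     # config["name"]
        "dev": "sheet-id-dev",     # hypothetical spreadsheet ids
        "prod": "sheet-id-prod",
        "public": "sheet-id-public",
    }
}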
Example #7
def write_index(config):

    df = get_data_from_athena("select * from "
                              f"{config['athena_database']}.{config['slug']}_"
                              f"{config['raw_table']}_index")

    with open('configs/drive-config.yaml', 'r') as f:
        drive_config = yaml.safe_load(f)

    if config['slug'] == 'dev':
        _write_sheets_table(df, config,
                            drive_config[config['name']][config['slug']])

    elif config['slug'] == 'prod':

        _write_sheets_table(df, config,
                            drive_config[config['name']][config['slug']])

        # The public sheet omits the raw observed/expected columns.
        drop_cols = [
            'observed', 'expected_2019', 'expected_2020', 'dashboard',
            'ratio_19'
        ]
        df = df.drop(columns=drop_cols)
        _write_sheets_table(df, config, drive_config[config['name']]['public'])
Example #8
def _region_slug_partition(config):

    # Regions that are ready to be processed.
    data = get_data_from_athena(
        "select * from "
        f"{config['athena_database']}.{config['slug']}_metadata_metadata_ready ",
        config,
    )

    # Regions explicitly flagged to be re-processed.
    rerun = data[data["rerun"] == "TRUE"]

    if config.get("if_exists") == "append":

        # Skip regions already present in the variation table; if that table
        # does not exist yet, nothing is skipped.
        try:
            skip = get_data_from_athena(
                "select distinct region_shapefile_wkt from "
                f"{config['athena_database']}.{config['slug']}_analysis_metadata_variation "
                "where n_days is not null",
                config,
            )
        except Exception:
            skip = pd.DataFrame([], columns=["region_shapefile_wkt"])

        data = data[
            ~data["region_shapefile_wkt"].isin(skip["region_shapefile_wkt"])
        ]

        if config["name"] == "analysis_daily":
            data = data[~data["region_slug"].isin(config["cv_exception"])]

    if config.get("filter_by_coef"):

        # Keep only regions whose weekly or daily coefficients were approved,
        # plus any explicitly exempted regions.
        keep = get_data_from_athena(
            "select region_slug from "
            f"{config['athena_database']}.{config['slug']}_analysis_metadata_variation "
            "where (weekly_approved = true or daily_approved = true) "
            f"""or (region_slug in ('{"','".join(config['cv_exception'])}')) """,
            config,
        )

        data = data[data["region_slug"].isin(keep["region_slug"])]

    if config.get("sample_cities"):

        # Optionally limit the run to the first N regions.
        data = data[:config["sample_cities"]]

    # Re-add flagged regions so they are always recomputed.
    data = pd.concat([data, rerun]).drop_duplicates()

    data = data.to_dict("records")

    for d in data:
        d["partition"] = d["region_slug"]

        if config["name"] == "analysis_daily":
            # Sampled regions only query a subset of the 2019 weeks.
            if d["region_slug"] in config["sampled"]:
                d["dates"] = sample_query_weeks(
                    config["full_2019_interval"]["start"],
                    config["full_2019_interval"]["end"],
                )

            d["p_path"] = "country_iso={country_iso}/{partition}".format(**d)
        else:
            d["p_path"] = "region_slug={partition}".format(**d)

    return data
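
A hedged usage sketch combining this partitioner with the insert pattern from Example #3 (the driver itself is an assumption, not part of the source):

partitions = _region_slug_partition(config)
insert_into.start(config, partitions)  # process each partition record in turn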