示例#1
0
def main_get_data(paths, parallel: bool = False, n_jobs: int = -2, modules_name: list = modules_name,
                  skip_countries: list = [], gsheets_api=None):
    """Get data from sources and export to output folder.

    Every entry of `modules_name` is run once through `CountryDataGetter.run`;
    entries whose result has `success` set to False are run a second time,
    sequentially. Entries that still fail are listed on stdout.

    Is equivalent to script `run_python_scripts.py`

    Args:
        paths: Forwarded to `CountryDataGetter`.
        parallel: When True, the first pass goes through `Parallel` with the
            "threading" backend; otherwise modules run one after another.
        n_jobs: Forwarded to `Parallel` (only used when `parallel` is True).
        modules_name: Names of the modules to run.
        skip_countries: Names forwarded, lower-cased, to `CountryDataGetter`.
        gsheets_api: Forwarded to `CountryDataGetter`.
    """
    print("-- Getting data... --")
    getter = CountryDataGetter(
        paths,
        [name.lower() for name in skip_countries],
        gsheets_api,
    )

    # First pass over all requested modules
    if parallel:
        runner = Parallel(n_jobs=n_jobs, backend="threading")
        first_pass = runner(delayed(getter.run)(name) for name in modules_name)
    else:
        first_pass = [getter.run(name) for name in modules_name]

    # Second pass: retry (sequentially) whatever failed the first time
    to_retry = [res["module_name"] for res in first_pass if res["success"] is False]
    logger.info(f"\n---\n\nRETRIALS ({len(to_retry)})")
    second_pass = [getter.run(name) for name in to_retry]

    still_failing = [res["module_name"] for res in second_pass if res["success"] is False]
    if still_failing:
        bullets = "\n".join(f"* {name}" for name in still_failing)
        print(f"\n---\n\nThe following scripts failed to run ({len(still_failing)}):\n{bullets}")
    print_eoe()
示例#2
0
def main_process_data(paths,
                      gsheets_api,
                      google_spreadsheet_vax_id: str,
                      skip_complete: list = None,
                      skip_monotonic: dict = {},
                      skip_anomaly: dict = {}):
    """Process manual and automated vaccination data and export the results.

    Data frames come from two places: `VaccinationGSheet.df_list()` and the
    CSV files at `paths.tmp_vax_out(country)` for every country listed in
    `gsheet.automated_countries`. Each frame whose location is not skipped is
    run through `process_location` and written to
    `paths.pub_vax_loc(country)`. The processed frames are then concatenated,
    sorted by `location` and `date`, and written to `paths.tmp_vax_all`. The
    spreadsheet metadata is written to `paths.tmp_met_all`.

    Args:
        paths: Object providing the input/output locations named above.
        gsheets_api: Forwarded to `VaccinationGSheet`.
        google_spreadsheet_vax_id: Forwarded to `VaccinationGSheet`.
        skip_complete: Lower-case location names that must not be processed
            or exported. `None` (the default) skips nothing.
        skip_monotonic: Maps a location name to the value handed to
            `process_location` as its second argument (`[]` when absent).
        skip_anomaly: Maps a location name to the value handed to
            `process_location` as its third argument (`[]` when absent).

    Raises:
        ValueError: If a data frame has no `location` column. `pd.concat`
            also raises ValueError when every location ends up skipped.
    """
    # The declared default is None; a membership test against None raises
    # TypeError, so treat "not given" as "skip nothing".
    if skip_complete is None:
        skip_complete = []
    # NOTE: the dict defaults are shared between calls, which is safe only
    # because they are read (via .get) and never mutated here.

    print("-- Processing data... --")
    # Get data from sheets
    logger.info("Getting data from Google Spreadsheet...")
    gsheet = VaccinationGSheet(gsheets_api, google_spreadsheet_vax_id)
    df_manual_list = gsheet.df_list()

    # Get automated-country data
    logger.info("Getting data from output...")
    automated = gsheet.automated_countries
    filepaths_auto = [paths.tmp_vax_out(country) for country in automated]
    df_auto_list = [pd.read_csv(filepath) for filepath in filepaths_auto]

    # Concatenate
    vax = df_manual_list + df_auto_list

    logger.info("Processing and exporting data...")
    vax_valid = []
    for df_country in vax:
        if "location" not in df_country:
            raise ValueError(f"Column `location` missing. df: {df_country.tail(5)}")
        # The location is read from the row labelled 0 of each frame.
        country = df_country.loc[0, "location"]
        if country.lower() in skip_complete:
            logger.info(f"{country}: SKIPPED 🚧")
            continue
        df_country = process_location(
            df_country,
            skip_monotonic.get(country, []),
            skip_anomaly.get(country, []),
        )
        vax_valid.append(df_country)
        # Export
        df_country.to_csv(paths.pub_vax_loc(country), index=False)
        logger.info(f"{country}: SUCCESS ✅")

    df_all = pd.concat(vax_valid).sort_values(by=["location", "date"])
    df_all.to_csv(paths.tmp_vax_all, index=False)
    gsheet.metadata.to_csv(paths.tmp_met_all, index=False)
    logger.info("Exported ✅")
    print_eoe()
示例#3
0
def main_propose_data_twitter(paths,
                              consumer_key: str,
                              consumer_secret: str,
                              parallel: bool = False,
                              n_jobs: int = -2):
    """Get data from Twitter and propose it.

    `_propose_data_country` is called once for every entry of the
    `modules_name` collection defined outside this function. Entries whose
    result has `success` set to False are retried once, sequentially, and
    those that fail again are listed on stdout.

    Args:
        paths: Forwarded to `_propose_data_country`.
        consumer_key: Forwarded to `TwitterAPI`.
        consumer_secret: Forwarded to `TwitterAPI`.
        parallel: When True, the first pass goes through `Parallel` with the
            "threading" backend; otherwise modules run one after another.
        n_jobs: Forwarded to `Parallel` (only used when `parallel` is True).
    """
    print("-- Generating data proposals from Twitter sources... --")
    api = TwitterAPI(consumer_key, consumer_secret)

    # First pass over all modules
    if parallel:
        runner = Parallel(n_jobs=n_jobs, backend="threading")
        first_pass = runner(
            delayed(_propose_data_country)(api, name, paths)
            for name in modules_name
        )
    else:
        first_pass = [
            _propose_data_country(api, name, paths) for name in modules_name
        ]

    # Second pass: retry (sequentially) whatever failed the first time
    to_retry = [
        res["module_name"] for res in first_pass if res["success"] is False
    ]
    logger.info(f"\n---\n\nRETRIALS ({len(to_retry)})")
    second_pass = [
        _propose_data_country(api, name, paths) for name in to_retry
    ]

    still_failing = [
        res["module_name"] for res in second_pass if res["success"] is False
    ]
    if still_failing:
        bullets = "\n".join(f"* {name}" for name in still_failing)
        print(
            f"\n---\n\nThe following scripts failed to run ({len(still_failing)}):\n{bullets}"
        )
    print_eoe()
示例#4
0
def main_process_data(paths,
                      google_credentials: str,
                      google_spreadsheet_vax_id: str,
                      skip_complete: list = None,
                      skip_monotonic: dict = {},
                      skip_anomaly: dict = {}):
    """Process manual and automated vaccination data and export the results.

    Data frames come from two places: `GSheet.df_list()` and the CSV files at
    `paths.tmp_vax_out(country)` for every country listed in
    `gsheet.automated_countries`. Every frame whose location is not skipped
    is run through `process_location`. Once all frames are processed, each
    one is written to `paths.pub_vax_loc(country)`, and their concatenation
    (sorted by `location` and `date`) is written to `paths.tmp_vax_all`. The
    spreadsheet metadata is written to `paths.tmp_met_all`.

    Args:
        paths: Object providing the input/output locations named above.
        google_credentials: Forwarded to `GSheet`.
        google_spreadsheet_vax_id: Forwarded to `GSheet`.
        skip_complete: Lower-case location names that must not be processed
            or exported. `None` (the default) skips nothing.
        skip_monotonic: Maps a location name to the value handed to
            `process_location` as its second argument (`[]` when absent).
        skip_anomaly: Maps a location name to the value handed to
            `process_location` as its third argument (`[]` when absent).

    Raises:
        ValueError: Raised by `pd.concat` when every location ends up skipped.
    """
    # The declared default is None; a membership test against None raises
    # TypeError, so treat "not given" as "skip nothing".
    if skip_complete is None:
        skip_complete = []
    # NOTE: the dict defaults are shared between calls, which is safe only
    # because they are read (via .get) and never mutated here.

    print("-- Processing data... --")
    # Get data from sheets
    logger.info("Getting data from Google Spreadsheet...")
    gsheet = GSheet(google_credentials, google_spreadsheet_vax_id)
    df_manual_list = gsheet.df_list()

    # Get automated-country data
    logger.info("Getting data from output...")
    automated = gsheet.automated_countries
    filepaths_auto = [paths.tmp_vax_out(country) for country in automated]
    df_auto_list = [pd.read_csv(filepath) for filepath in filepaths_auto]

    # Concatenate
    vax = df_manual_list + df_auto_list

    # Process locations (all of them before anything is exported)
    logger.info("Processing and exporting data...")
    processed = []
    for df_country in vax:
        # The location is read from the row labelled 0 of each frame.
        country = df_country.loc[0, "location"]
        if country.lower() in skip_complete:
            continue
        processed.append(
            process_location(
                df_country,
                skip_monotonic.get(country, []),
                skip_anomaly.get(country, []),
            )
        )

    # Export
    for df_country in processed:
        country = df_country.loc[0, "location"]
        df_country.to_csv(paths.pub_vax_loc(country), index=False)
    df_all = pd.concat(processed).sort_values(by=["location", "date"])
    df_all.to_csv(paths.tmp_vax_all, index=False)
    gsheet.metadata.to_csv(paths.tmp_met_all, index=False)
    logger.info("Exported ✅")
    print_eoe()
示例#5
0
def main_get_data(paths,
                  parallel: bool = False,
                  n_jobs: int = -2,
                  modules_name: list = modules_name,
                  greece_api_token: str = None):
    """Get data from sources and export to output folder.

    `_get_data_country` is called once for every entry of `modules_name`.
    Entries whose result has `success` set to False are retried once,
    sequentially, and those that fail again are reported through the logger.

    Is equivalent to script `run_python_scripts.py`

    Args:
        paths: Forwarded to `_get_data_country`.
        parallel: When True, the first pass goes through `Parallel` with the
            "threading" backend; otherwise modules run one after another.
        n_jobs: Forwarded to `Parallel` (only used when `parallel` is True).
        modules_name: Names of the modules to run.
        greece_api_token: Forwarded to `_get_data_country`.
    """
    print("-- Getting data... --")
    if parallel:
        modules_execution_results = Parallel(
            n_jobs=n_jobs,
            backend="threading")(delayed(_get_data_country)(module_name, paths,
                                                            greece_api_token)
                                 for module_name in modules_name)
    else:
        modules_execution_results = [
            _get_data_country(module_name, paths, greece_api_token)
            for module_name in modules_name
        ]

    modules_failed = [
        m["module_name"] for m in modules_execution_results
        if m["success"] is False
    ]
    # Retry failed modules
    logger.info(f"\n---\n\nRETRIALS ({len(modules_failed)})")
    # Keep the retry results: discarding them would leave the list of
    # still-failing modules empty and the report below unreachable.
    retrial_results = [
        _get_data_country(module_name, paths, greece_api_token)
        for module_name in modules_failed
    ]
    modules_failed_retrial = [
        m["module_name"] for m in retrial_results if m["success"] is False
    ]
    if modules_failed_retrial:
        failed_str = "\n".join(f"* {m}" for m in modules_failed_retrial)
        # `warning` replaces the deprecated `warn` alias; the newline after
        # the colon puts the first bullet on its own line.
        logger.warning(
            f"\n---\n\nThe following scripts failed to run ({len(modules_failed_retrial)}):\n{failed_str}"
        )
    print_eoe()