Example #1
import time
import traceback
from typing import Any, Iterator, List, Tuple

import requests

# Module-level constants and helpers (_WD_QUERY, _WD_URL, _LIMIT, _MAX_RETRIES,
# _INIT_WAIT_TIME, _WD_TIMEOUT, _REQUEST_HEADER, pbar, ErrorLogger) are assumed to be
# defined elsewhere in this module.


def wikidata_property(
    prop: str,
    entities: List[str],
    query_template: str = _WD_QUERY,
    logger: ErrorLogger = ErrorLogger(),
    offset: int = 0,
    **tqdm_kwargs,
) -> Iterator[Tuple[str, Any]]:
    """
    Query a single property from Wikidata, and return all entities which are part of the provided
    list which contain that property.

    Arguments:
        prop: Wikidata property, for example P1082 for population.
        entities: List of Wikidata identifiers to query the desired property.
        query: [Optional] SPARQL query used to retrieve `prop`.
        logger: [Optional] ErrorLogger instance to use for logging.
        offset: [Optional] Number of items to skip in the result set.
    Returns:
        Iterable[Tuple[str, Any]]: Iterable of <Wikidata ID, property value>
    """
    # Time to wait before retry in case of failure
    wait_time = _INIT_WAIT_TIME

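    # NOTE: `query_template` is filled in via str.format below, so any literal braces in
    # the SPARQL query itself must be escaped as {{ and }} in the template string.
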
    # Build the query from template
    tpl = query_template + " LIMIT {limit} OFFSET {offset}"
    query = tpl.format(prop=prop, limit=_LIMIT, offset=offset)

    # Keep retrying the request until it succeeds or _MAX_RETRIES is reached
    for i in range(_MAX_RETRIES):
        response = None

        try:
            start_time = time.monotonic()
            params = {"query": query, "format": "json"}
            req_opts = dict(headers=_REQUEST_HEADER,
                            params=params,
                            timeout=_WD_TIMEOUT)
            response = requests.get(_WD_URL, **req_opts)
            elapsed_time = time.monotonic() - start_time
            log_opts = dict(status=response.status_code,
                            url=_WD_URL,
                            time=elapsed_time,
                            **params)
            logger.log_info(f"Wikidata SPARQL server response", **log_opts)
            data = response.json()

            # Yield the property value for each binding that matches a requested entity
            for item in pbar(data["results"]["bindings"], **tqdm_kwargs):
                pid = item["pid"]["value"].split("/")[-1]
                if pid in entities:
                    yield pid, item["prop"]["value"]

            # If we got exactly `_LIMIT` results, there may be more, so keep increasing the
            # offset until we run out of results
            if len(data["results"]["bindings"]) == _LIMIT:
                yield from wikidata_property(
                    prop,
                    entities,
                    query_template=query_template,
                    logger=logger,
                    offset=offset + _LIMIT,
                    **tqdm_kwargs,
                )

            # If no exceptions were thrown, we have reached the end
            logger.log_info(f"Wikidata SPARQL results end reached")
            return

        except Exception as exc:

            # If we have reached the error limit, log and re-raise the error
            if i + 1 >= _MAX_RETRIES:
                msg = response.text if response is not None else "Unknown error"
                logger.log_error(msg,
                                 exc=exc,
                                 traceback=traceback.format_exc())
                raise exc

            # Use exponential backoff in case of error
            logger.log_info(
                f"({i + 1}) Request error. Retry in {wait_time} seconds...",
                exc=exc)
            time.sleep(wait_time)
            wait_time *= 2
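
Usage sketch (illustrative, not from the original module): the template below is only an
assumption of what a compatible `_WD_QUERY` could look like; it must bind `?pid` and
`?prop`, since that is what the parsing code above reads, and its literal SPARQL braces
are doubled so that str.format leaves them intact. The property P1082 (population), the
entity IDs, and the `desc` progress-bar label are all hypothetical values.

# Hypothetical template, NOT the module's real _WD_QUERY: selects <entity, value> pairs
# for the given property, with {{ }} escaping the braces for str.format
example_template = "SELECT ?pid ?prop WHERE {{ ?pid wdt:{prop} ?prop }}"

# Illustrative entities: Q30 = United States, Q183 = Germany
for wikidata_id, value in wikidata_property(
        "P1082", ["Q30", "Q183"], query_template=example_template, desc="Population"):
    print(wikidata_id, value)
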
Example #2
import datetime
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory

from pandas import DataFrame

# Project helpers (read_file, export_csv, date_range, table_cross_product, table_join,
# table_sort, get_table_columns, pbar, ErrorLogger, EXCLUDE_FROM_MAIN_TABLE) are assumed
# to be imported from elsewhere in this codebase.


def make_main_table(
    tables_folder: Path,
    output_path: Path,
    logger: ErrorLogger = ErrorLogger()) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
    Returns:
        DataFrame: Flat table with all data combined.
    """

    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Create a table containing all the keys from the index
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        logger.log_info("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_list = date_range("2020-01-01", max_date)
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        logger.log_info("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        logger.log_info("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path,
                   tables_folder / "index.csv", ["key"],
                   main_table_path,
                   how="outer")
        logger.log_info("Joined with table index")

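        # Start with the columns already present in the main table; any table joined only
        # by <key> (i.e. without a date column) adds its columns to this set below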
        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")],
                                    desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform outer joins on all tables
                table_join(main_table_path,
                           table_file_path,
                           join_on,
                           temp_file_path,
                           how="outer")
                shutil.move(temp_file_path, main_table_path)
                logger.log_info(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write it to the output location
        table_sort(main_table_path, output_path)
        logger.log_info("Sorted main table")