def wikidata_property(
    prop: str,
    entities: List[str],
    query_template: str = _WD_QUERY,
    logger: ErrorLogger = ErrorLogger(),
    offset: int = 0,
    **tqdm_kwargs,
) -> Any:
    """
    Query a single property from Wikidata, and return all entities which are part of the provided
    list which contain that property.

    Arguments:
        prop: Wikidata property, for example P1082 for population.
        entities: List of Wikidata identifiers to query the desired property.
        query_template: [Optional] SPARQL query used to retrieve `prop`.
        logger: [Optional] ErrorLogger instance to use for logging.
        offset: [Optional] Number of items to skip in the result set.
    Returns:
        Iterable[Tuple[str, Any]]: Iterable of <Wikidata ID, property value>
    """
    # Time to wait before retry in case of failure
    wait_time = _INIT_WAIT_TIME

    # Build the query from the template
    tpl = query_template + " LIMIT {limit} OFFSET {offset}"
    query = tpl.format(prop=prop, limit=_LIMIT, offset=offset)

    # Keep retrying the request until it succeeds, or _MAX_RETRIES is reached
    for i in range(_MAX_RETRIES):
        response = None

        try:
            start_time = time.monotonic()
            params = {"query": query, "format": "json"}
            req_opts = dict(headers=_REQUEST_HEADER, params=params, timeout=_WD_TIMEOUT)
            response = requests.get(_WD_URL, **req_opts)
            elapsed_time = time.monotonic() - start_time
            log_opts = dict(status=response.status_code, url=_WD_URL, time=elapsed_time, **params)
            logger.log_info("Wikidata SPARQL server response", **log_opts)
            data = response.json()

            # Yield the property value for every binding whose entity is in the requested list
            for item in pbar(data["results"]["bindings"], **tqdm_kwargs):
                pid = item["pid"]["value"].split("/")[-1]
                if pid in entities:
                    yield pid, item["prop"]["value"]

            # If we got a full page of `_LIMIT` results, there may be more: keep increasing the
            # offset until we run out of results
            if len(data["results"]["bindings"]) == _LIMIT:
                yield from wikidata_property(
                    prop,
                    entities,
                    query_template=query_template,
                    logger=logger,
                    offset=offset + _LIMIT,
                    **tqdm_kwargs,
                )

            # If no exceptions were thrown, we have reached the end
            logger.log_info("Wikidata SPARQL results end reached")
            return

        except Exception as exc:
            # If we have reached the retry limit, log and re-raise the error
            if i + 1 >= _MAX_RETRIES:
                msg = response.text if response is not None else "Unknown error"
                logger.log_error(msg, exc=exc, traceback=traceback.format_exc())
                raise exc

            # Use exponential backoff in case of error
            logger.log_info(f"({i + 1}) Request error. Retry in {wait_time} seconds...", exc=exc)
            time.sleep(wait_time)
            wait_time *= 2
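
# Illustrative usage sketch, not part of the original module: "P1082" is the Wikidata property
# for population, and "Q30" / "Q142" are the Wikidata identifiers for the United States and
# France. The `desc` keyword is assumed to be forwarded to the progress bar via `**tqdm_kwargs`.
def _example_population_lookup():
    """Collect the population value for a couple of well-known Wikidata entities."""
    return dict(wikidata_property("P1082", ["Q30", "Q142"], desc="Population"))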
def make_main_table(
    tables_folder: Path, output_path: Path, logger: ErrorLogger = ErrorLogger()
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>, and write the
    result to the output location.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        output_path: Output path for the resulting flat table.
        logger: [Optional] ErrorLogger instance to use for logging.
    """
    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        logger.log_info("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
        date_list = date_range("2020-01-01", max_date)
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        logger.log_info("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        logger.log_info("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path, tables_folder / "index.csv", ["key"], main_table_path, how="outer")
        logger.log_info("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")], desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on, temp_file_path, how="outer")
                shutil.move(temp_file_path, main_table_path)
                logger.log_info(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write to the output location
        table_sort(main_table_path, output_path)
        logger.log_info("Sorted main table")
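
# Illustrative usage sketch, not part of the original module: the paths below are hypothetical
# and assume `tables_folder` contains index.csv plus one CSV file per data table, which this
# function joins into a single flat output file.
if __name__ == "__main__":
    make_main_table(Path("output/tables"), Path("output/tables/main.csv"))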