def set_computed_values(cls, values: dict) -> dict:
    """
    Fill in data_id, last_cached_at, and sort_value when they are missing.

    - data_id: derived deterministically as md5(source_id + path).
    - last_cached_at: taken from the cached file's mtime, if the data is
      already on disk.
    - sort_value: one greater than the highest sort_value among the
      source's existing nodes (1 when the source has none).
    """
    data_id = cls.get_by_name_or_alias(values, "data_id")
    source_id = cls.get_by_name_or_alias(values, "source_id")

    if not data_id:
        # Stable identifier: hash of the source id plus the node path.
        digest = md5()
        digest.update(source_id.encode())
        digest.update(cls.get_by_name_or_alias(values, "path").encode())
        data_id = digest.hexdigest()
        values["dataId"] = data_id

    has_cached_at = cls.get_by_name_or_alias(values, "last_cached_at")
    if not has_cached_at and fs.data_exists(data_id):
        # The data file already exists, so use its mtime as the cache time.
        values["lastCachedAt"] = fs.get_file_last_modified(
            fs.data_path(data_id), format="unix_milliseconds")

    if not cls.get_by_name_or_alias(values, "sort_value"):
        # Append after the current maximum; default=0 handles a source
        # with no nodes yet, yielding a first sort_value of 1.
        siblings = SOURCES[source_id].nodes.values()
        values["sortValue"] = max(
            (node.sort_value for node in siblings), default=0) + 1

    return values
async def get_data(self, ignore_cache=False) -> pd.DataFrame:
    """
    Return this node's data, serving from the on-disk cache when possible.

    On a cache miss (or when ignore_cache is True) the data is fetched
    from the source, written to the cache, and last_cached_at is updated
    to the cache file's mtime.
    """
    # Cache hit: serve straight from disk unless the caller opted out.
    if fs.data_exists(self.data_id) and not ignore_cache:
        return fs.read_data(self.data_id)

    # Cache miss: the source's getter may be sync or async.
    getter = self.source._get_data
    if inspect.iscoroutinefunction(getter):
        data = await getter(self.path)
    else:
        data = getter(self.path)

    fs.save_data(self.data_id, data)
    self.last_cached_at = fs.get_file_last_modified(
        fs.data_path(self.data_id), format="unix_milliseconds")
    return data
async def build_profile_report(self) -> None:
    """
    Build a pandas profile report for this node's data.

    The report is built in a separate process (via
    execute_profile_report_builder) because it can be quite slow. If the
    report already exists on disk, this is a no-op.

    Raises:
        HTTPException: 500 wrapping any underlying failure, with the
            original exception preserved as the cause.
    """
    try:
        if fs.profile_report_exists(self.data_id):
            return  # Already built; nothing to do.

        # Ensure the data file exists on disk before the builder runs.
        await self.get_data()
        await execute_profile_report_builder(
            data_path=fs.data_path(self.data_id),
            output_path=fs.profile_report_path(self.data_id),
            title=f"{self.source.name} - {self.path}",
        )

        # The builder runs out-of-process, so verify it actually
        # produced the report file.
        if not fs.profile_report_exists(self.data_id):
            # RuntimeError (not bare Exception) — caught and wrapped below.
            raise RuntimeError(
                "The profile report failed to build for some reason")
    except Exception as e:
        # Chain the cause (`from e`) so the original traceback is not
        # lost when the failure is surfaced as an HTTP 500.
        raise HTTPException(status_code=500, detail=str(e)) from e