def materialized_views_execute_sensor(context: SensorExecutionContext):
    """Sensor for executing materialized views based on cron expressions."""
    # Setup Redis and Redlock
    r = Redis(constants.REDIS_HOST.value)
    lock = Redlock(
        key=constants.REDIS_KEY_MAT_VIEWS_MATERIALIZE_SENSOR_LOCK.value,
        auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
        masters=[r],
    )
    if lock.acquire(timeout=2):
        lock.release()
    else:
        yield SkipReason("Another run is already in progress!")
        return
    rp = RedisPal(constants.REDIS_HOST.value)
    # Get managed materialized views
    managed_materialized_views: dict = rp.get("managed_materialized_views")
    if managed_materialized_views is None:
        managed_materialized_views = {"views": {}}
    # Get current timestamp
    now = datetime.datetime.now(pytz.timezone("America/Sao_Paulo"))
    # Iterate over all managed materialized views, storing a list
    # of all queries to be executed
    queries_to_execute: list = []
    for blob_name, view_config in managed_materialized_views["views"].items():
        if view_config["materialized"] and (
                view_config["last_run"] is None
                or determine_whether_to_execute_or_not(
                    view_config["cron_expression"], now,
                    view_config["last_run"])):
            # Add to list of queries to execute
            queries_to_execute.append(blob_name)
    # Launch a run if we have any queries to execute
    if queries_to_execute:
        # Get run configuration
        config: dict = read_config(
            Path(__file__).parent / "materialized_views_execute.yaml")
        # Get run key
        run_key = build_run_key("materialized_views_execute", now)
        # Set inputs
        config["solids"]["resolve_dependencies_and_execute"]["inputs"][
            "queries_names"]["value"] = queries_to_execute
        yield RunRequest(run_key=run_key, run_config=config)
    # Tell Dagit the reason we skipped
    else:
        yield SkipReason("No materialization requested for now")
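# `determine_whether_to_execute_or_not` is a repo helper not shown here.
# Below is a minimal sketch of the semantics the sensor above relies on,
# assuming croniter-style cron parsing (illustrative, not the repo's code):
# a view is due when the first cron tick after its last run has passed.
def _cron_is_due_sketch(cron_expression: str, now: datetime.datetime,
                        last_run: datetime.datetime) -> bool:
    from croniter import croniter  # third-party cron iterator
    next_tick = croniter(cron_expression, last_run).get_next(datetime.datetime)
    return now >= next_tick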
def resolve_dependencies_and_execute(context, queries_names,
                                     materialization_locked: bool,
                                     materialization_lock):
    try:
        # Setup directed graph for DAG sorting
        graph = nx.DiGraph()
        # Get dependencies
        dependencies = {}
        rp = RedisPal(constants.REDIS_HOST.value)
        materialized_views: dict = rp.get(
            constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
        if materialized_views:
            for query_name in queries_names:
                if (query_name in materialized_views["views"]
                        and materialized_views["views"][query_name]
                        ["materialized"]):
                    graph.add_node(query_name)
                    dependencies[query_name] = materialized_views["views"][
                        query_name]["depends_on"]
                else:
                    context.log.warning(
                        f"{query_name} not found on Redis! Skipping...")
        # Log dependencies
        context.log.info(f"Dependencies: {dependencies}")
        # Add edges to graph
        for query_name in queries_names:
            if query_name in dependencies:
                for dep in dependencies[query_name]:
                    if dep in graph.nodes:
                        graph.add_edge(dep, query_name)
        context.log.info(f"Graph: {graph.edges()}")
        # Get topological order
        order = list(nx.topological_sort(graph))
        # Log topological order
        context.log.info(f"Order: {order}")
        # Execute queries in topological order
        for q in order:
            yield DynamicOutput(q, mapping_key=q.replace(".", "_"))
    except Exception:
        locks.release(materialization_lock)
        raise
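# Standalone illustration of the ordering guarantee used above: with edges
# pointing from dependency to dependent, nx.topological_sort always yields
# a parent view before any view that depends on it.
import networkx as nx

_g = nx.DiGraph()
_g.add_edges_from([("a.base", "a.child"), ("a.child", "a.grandchild")])
assert list(nx.topological_sort(_g)) == ["a.base", "a.child", "a.grandchild"]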
def example():
    """Demonstrates that RedisPal serializes and round-trips several Python types."""
    rp = RedisPal()
    key = "test"
    # Integers
    inp = 1
    rp.set(key, inp)
    ans = rp.get(key)
    print("Inp is {} of type {}, ans is {} of type {}".format(
        inp, type(inp), ans, type(ans)))
    # Floating points
    inp = 1.23
    rp.set(key, inp)
    ans = rp.get(key)
    print("Inp is {} of type {}, ans is {} of type {}".format(
        inp, type(inp), ans, type(ans)))
    # Strings
    inp = "Test"
    rp.set(key, inp)
    ans = rp.get(key)
    print("Inp is {} of type {}, ans is {} of type {}".format(
        inp, type(inp), ans, type(ans)))

    # Functions
    def echo(arg):
        return arg

    inp = echo
    rp.set(key, inp)
    ans = rp.get(key)
    print("Inp is {} of type {}, ans is {} of type {}".format(
        inp, type(inp), ans, type(ans)))
    # Numpy arrays
    import numpy as np
    inp = np.array([0, 1, 2, 3, 4])
    rp.set(key, inp)
    ans = rp.get(key)
    print("Inp is {} of type {}, ans is {} of type {}".format(
        inp, type(inp), ans, type(ans)))
def delete_managed_views(
    context,
    blob_names,
    materialization_locked: bool,
    materialization_lock: Redlock,
):
    try:
        r = Redis(constants.REDIS_HOST.value)
        rp = RedisPal(constants.REDIS_HOST.value)
        lock = Redlock(
            key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
            masters=[r],
            auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
        )
        with lock:
            materialized_views: dict = rp.get(
                constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
            if materialized_views is None:
                materialized_views = {"views": {}}
            for blob_name in blob_names:
                context.log.info(f"Deleting managed view {blob_name}")
                if blob_name in materialized_views["views"]:
                    del materialized_views["views"][blob_name]
                    prefix: str = os.getenv("BQ_PROJECT_NAME", "rj-smtr-dev")
                    table_name: str = f"{prefix}.{blob_name}"
                    update_view(table_name, {}, "", "", "", delete=True)
                    context.log.info("Success!")
                else:
                    context.log.info("View not found, skipping...")
            rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                   materialized_views)
    except Exception:
        try:
            materialization_lock.release()
        except Exception:
            pass
        raise
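# The lock-guarded read-modify-write pattern above, reduced to its core.
# A minimal sketch assuming pottery's Redlock (the `with` block waits for
# the lock, and `auto_release_time` bounds how long a crashed holder can
# keep it); the key name and host here are placeholders.
from pottery import Redlock
from redis import Redis

_r = Redis("localhost")
_demo_lock = Redlock(key="demo-lock", masters=[_r], auto_release_time=10)
with _demo_lock:
    pass  # read state, mutate it, write it back while holding the lock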
def get_configs_for_materialized_view(context, query_names: list,
                                      materialization_locked: bool,
                                      materialization_lock) -> dict:
    """Retrieves configs for materialized views."""
    try:
        for query_name in query_names:
            # Split query name into dataset_name and view_name
            dataset_name, view_name = query_name.split(".")
            # Load configs from GCS
            view_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, dataset_name, view_name)}.yaml'
            defaults_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, dataset_name)}/defaults.yaml'
            context.log.info(f"Defaults blob: {defaults_yaml}")
            context.log.info(f"View blob: {view_yaml}")
            defaults_blob = get_blob(defaults_yaml, SENSOR_BUCKET, mode="staging")
            view_blob = get_blob(view_yaml, SENSOR_BUCKET, mode="staging")
            if defaults_blob is None:
                raise Exception(f"Blob {defaults_yaml} not found!")
            defaults_dict = yaml.safe_load(defaults_blob.download_as_string())
            if view_blob:
                view_dict = yaml.safe_load(view_blob.download_as_string())
            else:
                context.log.warning(
                    f"Blob {view_yaml} not found. This is not an error.")
                view_dict = {}
            # Merge configs
            query_params = {**defaults_dict, **view_dict}
            # Build base configs
            now = datetime.datetime.now(pytz.timezone("America/Sao_Paulo"))
            run_key = build_run_key(query_name, now)
            with open(
                    str(Path(__file__).parent /
                        "materialized_views_base_config.yaml"), "r") as f:
                base_params: dict = yaml.safe_load(f)
            base_params["run_timestamp"] = "'{}'".format(
                convert_datetime_to_datetime_string(now))
            base_params["maestro_sha"] = "'{}'".format(
                fetch_branch_sha(constants.MAESTRO_REPOSITORY.value,
                                 constants.MAESTRO_DEFAULT_BRANCH.value))
            base_params["maestro_bq_sha"] = "'{}'".format(
                fetch_branch_sha(constants.MAESTRO_BQ_REPOSITORY.value,
                                 constants.MAESTRO_BQ_DEFAULT_BRANCH.value))
            base_params["run_key"] = "'{}'".format(run_key)
            # A few more params
            r = Redis(constants.REDIS_HOST.value)
            rp = RedisPal(constants.REDIS_HOST.value)
            lock = Redlock(
                key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
                masters=[r],
                auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
            )
            table_name = parse_filepath_to_tablename(view_yaml)
            with lock:
                managed = rp.get(
                    constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
                if managed is None:
                    managed = {"views": {}}
                if query_name not in managed["views"]:
                    raise Exception(
                        f"Query {query_name} not found in managed views: {managed}"
                    )
                d = managed["views"][query_name]
                changed = d["query_modified"]
                context.log.info(f"{query_name} changed: {changed}")
                d["query_modified"] = False
                last_run = d["last_run"]
                d["last_run"] = now
                rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                       managed)
            # Get query on GCS
            query_file = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, dataset_name, view_name)}.sql'
            query_blob = get_blob(query_file, SENSOR_BUCKET, mode="staging")
            if query_blob is None:
                raise Exception(f"Blob {query_file} not found!")
            base_query = query_blob.download_as_string().decode("utf-8")
            # Get parent queries on GCS (the loop variable must not shadow
            # the outer `query_name`)
            parent_queries = {}
            for parent_name in d["depends_on"]:
                # Materialized parents already exist as tables; skip them
                if (parent_name in managed["views"]
                        and managed["views"][parent_name]["materialized"]):
                    continue
                parent_file = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, "/".join(parent_name.split(".")[:2]))}.sql'
                parent_blob = get_blob(parent_file, SENSOR_BUCKET, mode="staging")
                if parent_blob is None:
                    context.log.warning(
                        f'Blob for parent query "{parent_file}" not found, skipping...'
                    )
                    continue
                parent_view_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, "/".join(parent_name.split(".")[:2]))}.yaml'
                parent_view_blob = get_blob(parent_view_yaml,
                                            SENSOR_BUCKET,
                                            mode="staging")
                if parent_view_blob is not None:
                    parent_view_dict = yaml.safe_load(
                        parent_view_blob.download_as_string())
                else:
                    parent_view_dict = {}
                parent_defaults_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, "/".join(parent_name.split(".")[:1]))}/defaults.yaml'
                parent_defaults_blob = get_blob(parent_defaults_yaml,
                                                SENSOR_BUCKET,
                                                mode="staging")
                if parent_defaults_blob is not None:
                    parent_defaults_dict = yaml.safe_load(
                        parent_defaults_blob.download_as_string())
                else:
                    context.log.warning(
                        f'Blob for parent query "{parent_defaults_yaml}" not found, skipping...'
                    )
                    continue
                parent_queries[parent_name] = {
                    "base_query":
                    parent_blob.download_as_string().decode("utf-8"),
                    "query_params": {
                        **parent_defaults_dict,
                        **parent_view_dict
                    },
                }
            context.log.info(f"Parent queries: {parent_queries}")
            # Build configs:
            # - table_name: str
            # - changed: bool
            # - base_query: str
            # - base_params: dict
            # - query_params: dict
            # - now: str
            # - last_run: str
            date_ranges = get_date_ranges(
                last_run if last_run else
                query_params["backfill"]["start_timestamp"],
                query_params["backfill"]["interval"], now)
            context.log.info(f"{date_ranges}")
            for i, _ in enumerate(date_ranges[:-1]):
                configs = {
                    "table_name": table_name,
                    "changed": changed if i == 0 else False,
                    "base_query": base_query,
                    "base_params": base_params,
                    "query_params": query_params,
                    "now": date_ranges[i + 1],
                    "last_run": date_ranges[i],
                    "parent_queries": parent_queries,
                }
                mapping_key = (
                    f'{configs["table_name"]}_{configs["last_run"]}_{configs["now"]}'
                    .replace(".", "_").replace("-", "_")
                    .replace(" ", "_").replace(":", "_"))
                yield DynamicOutput(
                    {
                        "config_dict": configs,
                        "materialization_lock": materialization_lock,
                    },
                    mapping_key=mapping_key)
    except Exception:
        try:
            locks.release(materialization_lock)
        except Exception:
            pass
        raise
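# `get_date_ranges` is a repo helper not shown here. A hypothetical sketch
# of the backfill windowing the solid above depends on: split [start, end]
# into consecutive boundaries at most `interval` apart, so each
# DynamicOutput materializes one window.
def _get_date_ranges_sketch(start: datetime.datetime,
                            interval: datetime.timedelta,
                            end: datetime.datetime) -> list:
    boundaries = [start]
    while boundaries[-1] + interval < end:
        boundaries.append(boundaries[-1] + interval)
    boundaries.append(end)
    return boundaries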
def manage_view(context, input_dict):
    view_name = input_dict["view_name"]
    materialization_lock = input_dict["materialization_lock"]
    try:
        # Setup Redis and Redlock
        r = Redis(constants.REDIS_HOST.value)
        rp = RedisPal(constants.REDIS_HOST.value)
        lock = Redlock(
            key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
            masters=[r],
            auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
        )
        # Get materialization information from Redis
        materialized_views: dict = rp.get(
            constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
        if materialized_views is None:
            materialized_views = {"views": {}}
        # Avoid KeyError when the view is unknown to Redis
        materialized = materialized_views["views"].get(view_name, {}).get(
            "materialized", False)
        # If this is materialized, flag it so the temp view is
        # materialized later
        if materialized:
            with lock:
                materialized_views["views"][view_name]["query_modified"] = True
                materialized_views["views"][view_name]["last_run"] = None
                rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                       materialized_views)
        context.log.info(
            f"Generate {view_name} as a view for now, materialization comes later"
        )
        # We need to build the query using the latest parameters
        # and build a view with it.
        # Get defaults for view_name
        blob_path = os.path.join(*([MATERIALIZED_VIEWS_PREFIX] +
                                   view_name.split(".")[:-1]))
        defaults_path = blob_path + "/defaults.yaml"
        context.log.info(f"Defaults path -> {defaults_path}")
        defaults_blob = get_blob(defaults_path, SENSOR_BUCKET, mode="staging")
        if defaults_blob is None:
            raise Exception(f"Blob {defaults_path} not found")
        defaults_dict: dict = yaml.safe_load(defaults_blob.download_as_string())
        # Parse dataset_name
        dataset_name = view_name.split(".")[0]
        # Parse view yaml path
        view_yaml = f'{os.path.join(MATERIALIZED_VIEWS_PREFIX, view_name)}.yaml'
        # Parse table_name
        prefix: str = os.getenv("BQ_PROJECT_NAME", "rj-smtr-dev")
        table_name: str = f"{prefix}.{view_name}"
        context.log.info(f"Table name is {table_name}")
        # Update view
        update_view(table_name,
                    defaults_dict,
                    dataset_name,
                    view_name.split(".")[-1],
                    view_yaml,
                    delete=False,
                    context=context)
    except Exception:
        try:
            materialization_lock.release()
        except Exception:
            pass
        raise
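# `update_view` is a repo helper not shown here. A hypothetical sketch of
# its core effect for the non-delete case, using google-cloud-bigquery
# (the function name and simplified signature are illustrative assumptions):
from google.cloud import bigquery

def _update_view_sketch(table_name: str, sql: str) -> None:
    client = bigquery.Client()
    # Recreate "project.dataset.view" as a plain (non-materialized) view
    client.delete_table(table_name, not_found_ok=True)
    view = bigquery.Table(table_name)
    view.view_query = sql
    client.create_table(view)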
def update_managed_views(
    context,
    blob_names,
    materialization_locked: bool,
    materialization_lock: Redlock,
):
    try:
        # Setup Redis and Redlock
        r = Redis(constants.REDIS_HOST.value)
        rp = RedisPal(constants.REDIS_HOST.value)
        views_lock = Redlock(
            key=constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS_LOCK.value,
            masters=[r],
            auto_release_time=constants.REDIS_LOCK_AUTO_RELEASE_TIME.value,
        )
        # Initialize graph
        graph = nx.DiGraph()
        # If blob_name ends with "defaults.yaml", we need to either add it
        # to Redis or update its values, and add runs for every child it
        # has and their dependencies.
        for blob_name in [b for b in blob_names if b.endswith("defaults.yaml")]:
            # Get dataset name
            blob_path = "/".join(
                [n for n in blob_name.split("/") if n != ""][:-1])
            dataset_name: str = blob_path.split("/")[-1]
            context.log.info("#" * 80)
            context.log.info(f"Updating {dataset_name} defaults")
            # Read the blob
            blob = get_blob(blob_name, SENSOR_BUCKET, mode="staging")
            if blob is None:
                raise Exception(f"Blob {blob_name} not found")
            blob_dict: dict = yaml.safe_load(blob.download_as_string())
            # Add it to Redis
            with views_lock:
                materialized_views: dict = rp.get(
                    constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
                if materialized_views is None:
                    materialized_views = {"views": {}}
                # Add every child to Redis
                if "views" not in blob_dict:
                    raise Exception(
                        f"Malformed blob (missing views key): {blob_name}")
                for key in blob_dict["views"].keys():
                    # Build key with dataset_name
                    m_key = f"{dataset_name}.{key}"
                    # This child also needs a run
                    context.log.info(f"Adding {m_key} to runs")
                    if m_key not in graph.nodes:
                        graph.add_node(m_key)
                    # Avoid KeyError
                    if "views" not in materialized_views:
                        materialized_views["views"] = {}
                    # Add to Redis
                    if m_key not in materialized_views["views"]:
                        materialized_views["views"][m_key] = {}
                    update_dict_with_dict(
                        materialized_views["views"][m_key], {
                            "cron_expression": blob_dict["scheduling"]["cron"],
                            "last_run": None,
                            "materialized": blob_dict["views"][key]["materialized"],
                            "query_modified": True,
                            "depends_on": blob_dict["views"][key]["depends_on"],
                        })
                    # Add dependencies to runs
                    for dep in blob_dict["views"][key]["depends_on"]:
                        context.log.info(
                            f"Adding {dep} to runs as dependency of {m_key}")
                        if dep not in graph.nodes:
                            graph.add_node(dep)
                        graph.add_edge(dep, m_key)
                    # Try to find specific values for this view
                    blob = get_blob(f"{blob_path}/{key}.yaml",
                                    SENSOR_BUCKET,
                                    mode="staging")
                    if blob:
                        # Replace values in Redis
                        specific = yaml.safe_load(
                            blob.download_as_string().decode("utf-8"))
                        materialized_views["views"][m_key][
                            "cron_expression"] = specific["scheduling"]["cron"]
                    else:
                        context.log.warning(
                            f"No specific values for {m_key} found. This is not an error."
                        )
                # Update Redis effectively
                rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                       materialized_views)
        # Otherwise, we need to add the blob_name and its
        # dependencies to the graph.
        for blob_name in [
                b for b in blob_names if not b.endswith("defaults.yaml")
        ]:
            # Get table name
            file_name = ".".join(blob_name.split("/")[-2:])
            table_name = ".".join(file_name.split(".")[:-1])
            context.log.info("#" * 80)
            context.log.info(f"Updating {table_name} specific values...")
            # If it's a YAML file, update values on Redis
            if blob_name.endswith(".yaml"):
                # Read the blob
                blob = get_blob(blob_name, SENSOR_BUCKET, mode="staging")
                if blob is None:
                    raise Exception(f"Blob {blob_name} not found")
                blob_dict: dict = yaml.safe_load(blob.download_as_string())
                # Update Redis
                with views_lock:
                    materialized_views: dict = rp.get(
                        constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
                    if materialized_views is None:
                        materialized_views = {"views": {}}
                    if table_name not in materialized_views["views"]:
                        materialized_views["views"][table_name] = {}
                    update_dict_with_dict(
                        materialized_views["views"][table_name], {
                            "cron_expression": blob_dict["scheduling"]["cron"],
                            "last_run": None,
                            "query_modified": True,
                        })
                    rp.set(constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value,
                           materialized_views)
            # Add table_name and its dependencies to runs
            context.log.info(f"Adding {table_name} to runs")
            if table_name not in graph.nodes:
                graph.add_node(table_name)
            materialized_views: dict = rp.get(
                constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value)
            if materialized_views is None:
                materialized_views = {"views": {}}
            if table_name in materialized_views["views"]:
                for dep in materialized_views["views"][table_name]["depends_on"]:
                    context.log.info(
                        f"Adding {dep} to runs as dependency of {table_name}")
                    if dep not in graph.nodes:
                        graph.add_node(dep)
                    graph.add_edge(dep, table_name)
        context.log.info(f"Graph edges: {graph.edges()}")
        # Get topological order
        order = list(nx.topological_sort(graph))
        # Filter out views that are not in materialized_views["views"],
        # fetching a fresh copy so the name is always bound
        materialized_views = rp.get(
            constants.REDIS_KEY_MAT_VIEWS_MANAGED_VIEWS.value) or {"views": {}}
        order = [o for o in order if o in materialized_views["views"]]
        # Log topological order
        context.log.info(f"Order: {order}")
        # Execute queries in topological order
        for q in order:
            yield DynamicOutput(
                {
                    "view_name": q,
                    "materialization_lock": materialization_lock,
                },
                mapping_key=q.replace(".", "_"))
    except Exception:
        try:
            materialization_lock.release()
        except Exception:
            pass
        raise
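# `update_dict_with_dict` is a repo helper; judging by its use above it is
# a shallow in-place merge. A hypothetical sketch:
def _update_dict_with_dict_sketch(dst: dict, src: dict) -> dict:
    for k, v in src.items():
        dst[k] = v  # overwrite existing keys, add missing ones
    return dst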
def redis_keepalive_on_failure(context: HookContext):
    """On solid failure, still send a keep-alive to Redis and notify Discord."""
    rp = RedisPal(host=constants.REDIS_HOST.value)
    rp.set(context.resources.keepalive_key["key"], 1)
    message = (f"Although solid {context.solid.name} has failed, "
               "a keep-alive was sent to Redis!")
    url = context.resources.discord_webhook["url"]
    requests.post(url, data={"content": message})
def redis_keepalive_on_success(context: HookContext):
    """On solid success, send a keep-alive to Redis."""
    rp = RedisPal(host=constants.REDIS_HOST.value)
    rp.set(context.resources.keepalive_key["key"], 1)
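# In Dagster's legacy (0.x) API, bodies like the two hooks above are wired
# to solids with the `failure_hook`/`success_hook` decorators. A sketch of
# the assumed wiring (resource keys mirror the attributes used above):
from dagster import failure_hook, success_hook

@failure_hook(required_resource_keys={"keepalive_key", "discord_webhook"})
def _keepalive_on_failure(context: HookContext):
    redis_keepalive_on_failure(context)

@success_hook(required_resource_keys={"keepalive_key"})
def _keepalive_on_success(context: HookContext):
    redis_keepalive_on_success(context)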
def materialized_views_update_sensor(context: SensorExecutionContext):
    """Sensor for updating materialized views on file changes.

    For every new or modified file, the pipeline
    `update_managed_materialized_views` is triggered, ensuring that the
    BigQuery materialized views are always up to date.
    """
    # Store largest mtime
    largest_mtime = 0
    # Store deleted and modified blobs
    deleted_blobs = []
    modified_blobs = []
    # Get connection to Redis
    rp = RedisPal(host=constants.REDIS_HOST.value)
    # Get list of blobs in bucket
    blobs_list = get_list_of_blobs(MATERIALIZED_VIEWS_PREFIX, SENSOR_BUCKET)
    # Get previous set of blobs from Redis
    previous_blobs_set: set = rp.get(
        constants.REDIS_KEY_MAT_VIEWS_BLOBS_SET.value)
    # If there is no previous set, create it
    if not previous_blobs_set:
        rp.set(constants.REDIS_KEY_MAT_VIEWS_BLOBS_SET.value,
               set(b.name for b in blobs_list))
    # If there is a previous set, diff it against the current set
    else:
        deleted_blobs = previous_blobs_set - set(b.name for b in blobs_list)
    # Get previous run mtime
    previous_run_mtime = rp.get(
        constants.REDIS_KEY_MAT_VIEWS_LAST_RUN_MTIME.value)
    # If there is no previous run mtime, treat every current blob as modified
    if not previous_run_mtime:
        modified_blobs = blobs_list
    # Otherwise, keep only the blobs modified since the previous run
    else:
        modified_blobs = filter_blobs_by_mtime(blobs_list, previous_run_mtime)
    # Update last run time
    largest_mtime = get_largest_blob_mtime(blobs_list)
    rp.set(constants.REDIS_KEY_MAT_VIEWS_LAST_RUN_MTIME.value, largest_mtime)
    # If there are modified or deleted files, trigger the pipeline
    if modified_blobs or deleted_blobs:
        # Load run configuration and set inputs
        config: dict = read_config(
            Path(__file__).parent / "materialized_views_update.yaml")
        config["solids"]["delete_managed_views"]["inputs"]["blob_names"][
            "value"] = list(deleted_blobs)
        config["solids"]["update_managed_views"]["inputs"]["blob_names"][
            "value"] = [b.name for b in modified_blobs]
        # Set a run key
        run_key: str = build_run_key("update-managed-views", largest_mtime)
        # Yield a run request
        yield RunRequest(run_key=run_key, run_config=config)
    # If there are no modified or deleted files, skip the pipeline
    else:
        yield SkipReason(
            f"Modified files: {len(modified_blobs)}. Deleted files: {len(deleted_blobs)}"
        )
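# `build_run_key` is a repo helper used by both sensors above. A
# hypothetical sketch of its contract: a deterministic, human-readable key
# so Dagster can de-duplicate RunRequests for the same trigger.
def _build_run_key_sketch(name: str, timestamp) -> str:
    return f"{name}-{timestamp}"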