def test_node_with_query_file():
    """Check that a BigQueryNode loads its query text from ``query_file``.

    The query is written to a scratch file in the working directory. The node
    is then built twice: once without parameters (``_query`` must equal the raw
    file content) and once with ``query_params`` (``_query`` must equal the
    content after ``str.format`` with those parameters).
    """
    query_str = "SELECT {value}"
    query_params = {"value": 1}
    test_file = "bigquery_node_test.sql"

    with open(test_file, "w") as f:
        f.write(query_str)

    # try/finally so the scratch file is removed even when an assertion fails;
    # otherwise a failing run leaves the file behind in the working directory.
    try:
        with FeatureDAG():
            a = BigQueryNode(name="query a", query_file=test_file, project="my-project")
            assert a._query == query_str

        with FeatureDAG():
            a = BigQueryNode(
                name="query a",
                query_file=test_file,
                query_params=query_params,
                project="my-project",
            )
            assert a._query == query_str.format(**query_params)
    finally:
        os.remove(test_file)
def test_same_node_cant_be_added_to_dag_twice():
    """Expect ``dag.add_node`` to raise ValueError for a node created in that DAG's context."""
    with FeatureDAG() as graph:
        node = FeatureNode(name="query")

        with pytest.raises(ValueError):
            graph.add_node(node)
def test_node_with_non_existent_file():
    """Expect FileNotFoundError when ``query_file`` names a file that does not exist."""
    with FeatureDAG(), pytest.raises(FileNotFoundError):
        BigQueryNode(
            name="query a",
            query_file="does_not_exist.sql",
            project="my-project",
        )
def test_node_project_specified_in_dag():
    """Check that a node built without ``project`` reports the DAG's ``dag_params`` project."""
    project = "my-project"
    dag_params = {"project": project}

    with FeatureDAG(dag_params=dag_params):
        node = BigQueryNode(name="query a", query="SELECT 1")
        assert node.project == project
def test_assign_node_to_self_fail():
    """Expect ValueError when a node is chained to itself with ``>>``."""
    with FeatureDAG():
        node = FeatureNode(name="query a")

        with pytest.raises(ValueError):
            node >> node
def test_calc_cache_without_input_tables():
    """Compare cache tags of nodes that declare no input tables.

    Two nodes with the same query text must produce equal tags; a node with a
    different query text must produce a different tag.
    """
    with FeatureDAG(dag_params={"project": "my-project"}):
        first = BigQueryNode(name="query a", query="SELECT 1")
        same_query = BigQueryNode(name="query b", query="SELECT 1")
        other_query = BigQueryNode(name="query c", query="SELECT 2")

        # Same query text -> same tag, even though the node names differ.
        assert first._calc_current_cache_tag() == same_query._calc_current_cache_tag()
        # Different query text -> different tag.
        assert first._calc_current_cache_tag() != other_query._calc_current_cache_tag()
def test_node_cant_be_assigned_twice():
    """Expect ValueError when the same ``parent >> child`` edge is declared a second time."""
    with FeatureDAG():
        parent = FeatureNode(name="query a")
        child = FeatureNode(name="query b")

        # First declaration of the edge is accepted.
        parent >> child

        # Repeating it must be rejected.
        with pytest.raises(ValueError):
            parent >> child
def test_simple_dag():
    """Check that ``a >> b`` records b as a child of a and a as a parent of b."""
    with FeatureDAG():
        parent = FeatureNode(name="query a")
        child = FeatureNode(name="query b")

        parent >> child

        assert child in parent.children
        assert parent in child.parents
def test_node_with_both_query_and_file():
    """Expect ValueError when both ``query`` and ``query_file`` are supplied."""
    conflicting_kwargs = {
        "name": "query a",
        "query": "test",
        "query_file": "test.sql",
        "project": "my-project",
    }

    with FeatureDAG(), pytest.raises(ValueError):
        BigQueryNode(**conflicting_kwargs)
def test_node_with_query_str():
    """Check ``_query`` for an inline query string, with and without ``query_params``."""
    template = "SELECT {value}"
    params = {"value": 1}

    # Without parameters the stored query is the template, unchanged.
    with FeatureDAG():
        plain = BigQueryNode(name="query a", query=template, project="my-project")
        assert plain._query == template

    # With parameters the stored query equals the str.format result.
    with FeatureDAG():
        formatted = BigQueryNode(
            name="query a",
            query=template,
            query_params=params,
            project="my-project",
        )
        assert formatted._query == template.format(**params)
def test_assign_child_node_fail():
    """Expect ValueError when a list containing a node's child is made that node's parent."""
    with FeatureDAG():
        parent = FeatureNode(name="query a")
        child = FeatureNode(name="query b")
        unrelated = FeatureNode(name="query c")

        parent >> child

        # A list on the left of ``>>`` goes through the node's reflected
        # right-shift (rrshift); that is the only route to this error.
        with pytest.raises(ValueError):
            [unrelated, child] >> parent
def test_bracket_middle_dag():
    """Check edges created when a list of nodes appears in a ``>>`` expression.

    The first graph puts the list in the middle of a single chain; the second
    builds a chain step by step and then uses a list as the left operand.
    """
    # Graph 1: a >> [b, c] >> d
    with FeatureDAG():
        node_a = FeatureNode(name="query a")
        node_b = FeatureNode(name="query b")
        node_c = FeatureNode(name="query c")
        node_d = FeatureNode(name="query d")

        node_a >> [node_b, node_c] >> node_d

        # a fans out to b and c
        assert node_b in node_a.children
        assert node_c in node_a.children
        assert node_a in node_b.parents
        assert node_a in node_c.parents
        # b and c both feed d
        assert node_d in node_b.children
        assert node_d in node_c.children
        assert node_b in node_d.parents
        assert node_c in node_d.parents

    del node_a, node_b, node_c, node_d

    # Graph 2: a >> b >> c built in two steps, then [c, a] >> d
    with FeatureDAG():
        node_a = FeatureNode(name="query a")
        node_b = FeatureNode(name="query b")
        node_c = FeatureNode(name="query c")
        node_d = FeatureNode(name="query d")

        node_a >> node_b
        node_b >> node_c
        [node_c, node_a] >> node_d

        # the a -> b -> c chain
        assert node_b in node_a.children
        assert node_c in node_b.children
        assert node_a in node_b.parents
        assert node_b in node_c.parents
        # a and c both feed d
        assert node_d in node_a.children
        assert node_d in node_c.children
        assert node_a in node_d.parents
        assert node_c in node_d.parents
def test_bracket_children_dag():
    """Check that ``a >> [b, c]`` links a to both listed nodes, in both directions."""
    with FeatureDAG():
        parent = FeatureNode(name="query a")
        left = FeatureNode(name="query b")
        right = FeatureNode(name="query c")

        parent >> [left, right]

        assert left in parent.children
        assert right in parent.children
        assert parent in left.parents
        assert parent in right.parents
def test_double_arrow_dag():
    """Check that ``a >> b >> c`` creates the edges a -> b and b -> c."""
    with FeatureDAG():
        head = FeatureNode(name="query a")
        middle = FeatureNode(name="query b")
        tail = FeatureNode(name="query c")

        head >> middle >> tail

        assert middle in head.children
        assert tail in middle.children
        assert head in middle.parents
        assert middle in tail.parents
def test_bracket_parent_dag():
    """Check that ``[a, b] >> c`` makes c a child of both listed nodes."""
    with FeatureDAG():
        left = FeatureNode(name="query a")
        right = FeatureNode(name="query b")
        child = FeatureNode(name="query c")

        [left, right] >> child

        assert child in left.children
        assert child in right.children
        assert left in child.parents
        assert right in child.parents
def test_assign_parent_to_node_fail():
    """Expect ValueError when a node is chained back to one of its ancestors.

    Covers both the direct parent (b >> a after a >> b) and a grandparent
    (c >> a after a >> b >> c).
    """
    with FeatureDAG():
        root = FeatureNode(name="query a")
        middle = FeatureNode(name="query b")

        root >> middle

        # Direct back-edge to the parent.
        with pytest.raises(ValueError):
            middle >> root

        leaf = FeatureNode(name="query c")
        middle >> leaf

        # Back-edge to the grandparent.
        with pytest.raises(ValueError):
            leaf >> root
def test_compact_state():
    """Check that ``compact_state`` drops state for a node no longer in ``dag._nodes``."""
    with FeatureDAG() as graph:
        node = FeatureNode(name="query a")
        graph.run_feature_graph()

        # Take the node out of the DAG's private node collection directly.
        graph._nodes.remove(node)

        # Its state entry is still present until compaction runs.
        assert node.node_id in graph._state_dict.keys()

        graph.compact_state()

        assert node.node_id not in graph._state_dict.keys()
def test_clear_state():
    """Check ``_get_state_cache_tag`` before a run, after a run, and after ``clear_state``.

    NOTE(review): ``_get_state_cache_tag`` is read without being called, so
    these checks are only meaningful if it is a property — confirm.
    """
    with FeatureDAG() as graph:
        node = FeatureNode(name="query a")

        # Nothing stored before the first run.
        assert node._get_state_cache_tag is None

        graph.run_feature_graph()
        assert node._get_state_cache_tag is not None

        # Clearing brings the node back to the initial condition.
        node.clear_state()
        assert node._get_state_cache_tag is None
def test_state_stored_with_file():
    """Check that node state written to a ``state_db`` file is seen by a new DAG.

    A node is run in one DAG backed by the file; a second DAG opened on the
    same file must then report a node with the same name as not stale.
    """
    db_path = "./dag_state.sqlite"

    # Start from a clean slate in case an earlier run left the file behind.
    if os.path.exists(db_path):
        os.remove(db_path)

    with FeatureDAG(state_db=db_path) as graph:
        node = FeatureNode(name="query a")
        assert node.is_node_stale is True

        graph.run_feature_graph()
        assert node.is_node_stale is False

    # Drop the first DAG and node before reopening the same state file.
    del node, graph

    with FeatureDAG(state_db=db_path):
        node = FeatureNode(name="query a")
        assert node.is_node_stale is False

    os.remove(db_path)
def test_state_stored():
    """Check that a node runs once, stops being stale, and is skipped on the next DAG run."""
    with FeatureDAG() as graph:
        node = FeatureNode(name="query a")
        assert node.is_node_stale is True

        # Replace run() with a mock so invocations can be counted.
        run_mock = Mock()
        node.run = run_mock

        graph.run_feature_graph()
        run_mock.assert_called_once()
        assert node.is_node_stale is False

        # Run the DAG again: the node must not be executed a second time.
        run_mock.reset_mock()
        graph.run_feature_graph()
        run_mock.assert_not_called()
def test_node_project_not_specified():
    """Expect LookupError when neither the node nor the DAG supplies a project."""
    with FeatureDAG(), pytest.raises(LookupError):
        BigQueryNode(name="query a", query="SELECT 1")
def test_two_nodes_cant_have_same_name():
    """Expect ValueError when a second node reuses a name already taken in the DAG."""
    with FeatureDAG():
        # Keep the first node bound for the duration of the check.
        existing = FeatureNode(name="query")

        with pytest.raises(ValueError):
            FeatureNode(name="query")
def test_node_with_neither_query_or_file():
    """Expect ValueError when a BigQueryNode is given neither ``query`` nor ``query_file``."""
    with FeatureDAG(), pytest.raises(ValueError):
        BigQueryNode(name="query a", project="my-project")
def test_calc_cache_with_input_tables():
    """Compare cache tags of nodes that declare an input table.

    First DAG: two nodes with the same query and the same input table must get
    equal tags; a node with a different query and a different input table must
    get a different tag.

    Second DAG: two nodes with the same query and input table, but with the
    ``get_table`` mock swapped for one carrying a later ``modified`` value,
    must get different tags.
    """
    # Local import: only this test needs timedelta.
    from datetime import timedelta

    def fake_get_table(table_id, modified):
        # Builds the same MagicMock the test installs as ``client.get_table``.
        # NOTE(review): keyword arguments to MagicMock configure attributes on
        # the mock itself, not on its ``return_value`` — confirm this is what
        # the cache-tag calculation is meant to read.
        return MagicMock(
            project="my-project",
            dataset_id="my_fake_dataset",
            table_id=table_id,
            modified=modified,
        )

    # NOTE(review): presumably google.cloud.bigquery; constructing a real
    # Client may need ambient credentials — confirm this is acceptable in CI.
    client = bigquery.Client()
    mod_timestamp = datetime.now()
    # Deterministic "later" timestamp. A second datetime.now() call is not
    # guaranteed to differ from the first on coarse-resolution clocks.
    later_timestamp = mod_timestamp + timedelta(seconds=1)

    with FeatureDAG(dag_params={"project": "my-project"}):
        client.get_table = fake_get_table("my_fake_table", mod_timestamp)
        a = BigQueryNode(
            name="query a",
            query="SELECT 1",
            input_tables="my_fake_dataset.my_fake_table",
            client=client,
        )
        # Tags are computed eagerly, while the matching get_table mock is
        # still installed on the shared client.
        node_a_cache_tag = a._calc_current_cache_tag()

        b = BigQueryNode(
            name="query b",
            query="SELECT 1",
            input_tables="my_fake_dataset.my_fake_table",
            client=client,
        )
        node_b_cache_tag = b._calc_current_cache_tag()

        client.get_table = fake_get_table("my_other_fake_table", mod_timestamp)
        c = BigQueryNode(
            name="query c",
            query="SELECT 2",
            input_tables="my_fake_dataset.my_other_fake_table",
            client=client,
        )

        assert node_a_cache_tag == node_b_cache_tag
        assert node_a_cache_tag != c._calc_current_cache_tag()

    del a
    del b

    # Also test date changed
    with FeatureDAG(dag_params={"project": "my-project"}):
        client.get_table = fake_get_table("my_fake_table", mod_timestamp)
        a = BigQueryNode(
            name="query a",
            query="SELECT 1",
            input_tables="my_fake_dataset.my_fake_table",
            client=client,
        )
        node_a_cache_tag = a._calc_current_cache_tag()

        client.get_table = fake_get_table("my_fake_table", later_timestamp)
        b = BigQueryNode(
            name="query b",
            query="SELECT 1",
            input_tables="my_fake_dataset.my_fake_table",
            client=client,
        )

        assert node_a_cache_tag != b._calc_current_cache_tag()