예제 #1
0
 def test_valid(self, simple_tuple_node_list):
     """Every tuple spec in the fixture should construct a node successfully."""
     built = [node(*spec) for spec in simple_tuple_node_list]
     assert len(built) == len(simple_tuple_node_list)
예제 #2
0
def test_bad_node(func, expected):
    """A bad node spec should be rejected with a ValueError matching *expected*."""
    bad_args = func()
    with pytest.raises(ValueError, match=expected):
        node(*bad_args)
예제 #3
0
 def test_tag_nodes(self):
     """tag() should add to, not replace, tags given at construction time."""
     tagged = node(identity, "input", "output", tags=["hello"])
     tagged = tagged.tag(["world"])
     assert "hello" in tagged.tags
     assert "world" in tagged.tags
     assert len(tagged.tags) == 2
예제 #4
0
 def test_node_less_than(self):
     """Nodes order by name: "A" sorts before "B"."""
     node_a = node(identity, "input1", "output1", name="A")
     node_b = node(identity, "input1", "output1", name="B")
     assert node_a < node_b
     assert node_a is not node_b
예제 #5
0
 def test_different_input_list_order_not_equal(self):
     """Input ordering is significant when comparing nodes for equality."""
     original = node(biconcat, ["input1", "input2"], "output1", name="A")
     reordered = node(biconcat, ["input2", "input1"], "output1", name="A")
     assert original != reordered
예제 #6
0
 def test_inputs_none(self):
     """A node built with inputs=None reports an empty input list."""
     no_input_node = node(constant_output, None, "output")
     assert no_input_node.inputs == []
예제 #7
0
 def test_outputs_str(self):
     """A single string output is exposed as a one-element list."""
     single_output_node = node(identity, "input1", "output1")
     assert single_output_node.outputs == ["output1"]
예제 #8
0
def saving_none_pipeline():
    """Chain A -> B -> C where the first node takes no input."""
    steps = [
        node(random, None, "A"),
        node(null, "A", "B"),
        node(identity, "B", "C"),
    ]
    return Pipeline(steps)
예제 #9
0
def branchless_pipeline():
    """Straight-line pipeline ds1 -> ds2 -> ds3 with no branching."""
    steps = [
        node(identity, "ds1", "ds2", name="node1"),
        node(identity, "ds2", "ds3", name="node2"),
    ]
    return Pipeline(steps)
def create_pipeline(**kwargs):
    """Build the HKTV mall scraping pipeline.

    Stages, in data-dependency order:
      1. start_conn:   build the request header for the HKTV mall site.
      2. get_category: request the category dictionary and clean it into a df.
      3. gen_url_list: generate product URLs per browse method and full site.
      4. req_raw_df:   multi-threaded requests producing raw response lists.
      5. etl_on_df:    normalise each raw response list into a DataFrame.
      6. df_to_csv:    persist every DataFrame as a kedro CSVDataSet.

    The large commented-out append-based variant that used to follow the
    `pipe` assembly duplicated the fullsite nodes already defined above and
    has been removed.
    """

    # conn start
    start_conn = [
        node(hktvmall_conn_node,
             inputs="params:hktvmall_home_url",
             outputs="HktvmallHeader",
             tags="Preparation")
    ]

    # get HKTV mall categories
    get_category = [
        node(request_hktvmall_catagory_code,
             inputs=["HktvmallHeader", "params:hktvmall_category_diction_url"],
             outputs="category_raw_req",
             tags="Preparation"),
        node(categories_df_etl,
             inputs="category_raw_req",
             outputs="category_df",
             tags="Preparation")
    ]

    # generate urls by type for requests
    gen_url_list = [
        node(gen_hktvmall_product_by_method_and_cat_links,
             inputs=[
                 'params:hktvmall_catagory_code',
                 'params:hktvmall_browse_method',
                 "params:product_by_method_catcode_url"
             ],
             outputs=dict(method1="promotiondiff_url_list",
                          method2="hotpickorder_url_list"),
             tags="Preparation"),
        node(gen_hktvmall_full_site_links,
             inputs=["category_df", 'params:hktvmall_cat_product_url'],
             outputs="fullsite_url_list",
             tags="Preparation")
    ]

    # multi threading requests for raw data
    req_raw_df = [
        node(multi_threading_req,
             inputs=["HktvmallHeader", "promotiondiff_url_list"],
             outputs="promotiondiff_raw_list",
             tags="Requests"),
        node(multi_threading_req,
             inputs=["HktvmallHeader", "hotpickorder_url_list"],
             outputs="hotpickorder_raw_list",
             tags="Requests"),
        node(multi_threading_req,
             inputs=["HktvmallHeader", 'fullsite_url_list'],
             outputs="fullsite_raw_list",
             tags="Requests")
    ]

    # ETL on df columns for proper columns
    etl_on_df = [
        node(raw_etl,
             inputs="promotiondiff_raw_list",
             outputs="promotiondiff_raw_df",
             tags="ETL"),
        node(raw_etl,
             inputs="hotpickorder_raw_list",
             outputs="hotpickorder_raw_df",
             tags="ETL"),
        node(raw_etl,
             inputs="fullsite_raw_list",
             outputs="fullsite_raw_df",
             tags="ETL"),
    ]

    # turn df to CSVDataSet
    df_to_csv = [
        node(df_to_kedro_csvdataset,
             inputs=["category_df", "params:category_path"],
             outputs="category_raw",
             tags="Saving Data"),
        node(df_to_kedro_csvdataset,
             inputs=["promotiondiff_raw_df", "params:promotiondiff_path"],
             outputs="promotiondiff_raw",
             tags="Saving Data"),
        node(df_to_kedro_csvdataset,
             inputs=["hotpickorder_raw_df", "params:hotpickorder_path"],
             outputs="hotpickorder_raw",
             tags="Saving Data"),
        node(df_to_kedro_csvdataset,
             inputs=["fullsite_raw_df", "params:fullsite_path"],
             outputs="fullsite_raw",
             tags="Saving Data")
    ]

    pipe = start_conn + get_category + gen_url_list + req_raw_df + etl_on_df + df_to_csv

    return Pipeline(pipe)
예제 #11
0
def create_pipeline(**kwargs):
    """Score the random-forest model: predictions plus a CV quality metric."""
    prediction_node = node(make_prediction, ["test_x", "rf_model"], "predict")
    quality_node = node(rmse_cv, ["test_x", "test_y", "rf_model"], "quality")
    return Pipeline([prediction_node, quality_node])
예제 #12
0
def _create_pipelines():
    """Build the fixture pipelines: two failing variants plus a healthy default.

    Returns:
        Dict mapping pipeline names to Pipeline objects, including an empty
        and a single-node "simple" pipeline.

    NOTE(review): the name "nodes3" (vs "node3" in default_pipeline) looks
    like a typo, but resume-hint messages elsewhere reference "nodes3" --
    confirm before renaming.
    """
    # Fails at the third node, leaving the downstream node unrun.
    bad_pipeline_middle = Pipeline(
        [
            node(identity, "cars", "boats", name="node1", tags=["tag1"]),
            node(identity, "boats", "trains", name="node2"),
            node(bad_node, "trains", "ships", name="nodes3"),
            node(identity, "ships", "planes", name="node4"),
        ],
        tags="bad_pipeline",
    )
    # Fails at the very first node.
    bad_pipeline_head = Pipeline(
        [
            node(bad_node, "cars", "boats", name="node1", tags=["tag1"]),
            node(identity, "boats", "trains", name="node2"),
            node(identity, "trains", "ships", name="nodes3"),
            node(identity, "ships", "planes", name="node4"),
        ],
        tags="bad_pipeline",
    )
    # Same shape as the bad variants but with no failing node.
    default_pipeline = Pipeline(
        [
            node(identity, "cars", "boats", name="node1", tags=["tag1"]),
            node(identity, "boats", "trains", name="node2"),
            node(identity, "trains", "ships", name="node3"),
            node(identity, "ships", "planes", name="node4"),
        ],
        tags="pipeline",
    )
    return {
        "__default__": default_pipeline,
        "empty": Pipeline([]),
        "simple": Pipeline([node(identity, "cars", "boats")]),
        "bad_pipeline_middle": bad_pipeline_middle,
        "bad_pipeline_head": bad_pipeline_head,
    }
예제 #13
0
 def test_task_exception(self, fan_out_fan_in, catalog):
     """An exception raised inside a node should propagate out of the runner."""
     catalog.add_feed_dict(feed_dict=dict(A=42))
     failing = Pipeline([fan_out_fan_in, node(exception_fn, "Z", "X")])
     with pytest.raises(Exception, match="test exception"):
         ThreadRunner().run(failing, catalog)
예제 #14
0
 def test_labelled(self):
     """A named node includes its label in the string representation."""
     labelled = node(lambda x: None, "input1", "output1", name="labeled_node")
     assert "labeled_node: <lambda>([input1]) -> [output1]" in str(labelled)
예제 #15
0
 def test_no_input(self):
     """A node with no inputs renders them as None in str()."""
     no_input_node = node(constant_output, None, "output1")
     assert "constant_output(None) -> [output1]" in str(no_input_node)
예제 #16
0
def saving_result_pipeline():
    """Single-node pipeline copying ds into dsX."""
    only_node = node(identity, "ds", "dsX")
    return Pipeline([only_node])
예제 #17
0
 def test_no_output(self):
     """A node with no outputs renders them as None in str()."""
     no_output_node = node(lambda x: None, "input1", None)
     assert "<lambda>([input1]) -> None" in str(no_output_node)
예제 #18
0
@pytest.fixture
def dummy_dataframe():
    """Small three-column DataFrame used as a stand-in dataset."""
    data = {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]}
    return pd.DataFrame(data)


def identity(input1: str):
    """Return the argument unchanged."""
    return input1  # pragma: no cover


def bad_node(x):
    """Deliberately broken node body: always raises ValueError."""
    raise ValueError("Oh no!")


# Pipeline whose third node (bad_node) always raises, so node4 never runs.
bad_pipeline_middle = Pipeline(
    [
        node(identity, "cars", "boats", name="node1", tags=["tag1"]),
        node(identity, "boats", "trains", name="node2"),
        node(bad_node, "trains", "ships", name="nodes3"),
        node(identity, "ships", "planes", name="node4"),
    ],
    tags="bad_pipeline",
)

# Resume hint expected when bad_pipeline_middle fails at "nodes3":
# that node plus its downstream node are the 2 that have not run.
expected_message_middle = (
    "There are 2 nodes that have not run.\n"
    "You can resume the pipeline run by adding the following "
    "argument to your previous command:\n"
    '  --from-nodes "nodes3"')

bad_pipeline_head = Pipeline(
    [
예제 #19
0
 def test_outputs_none(self):
     """A node built with outputs=None reports an empty output list."""
     sink_node = node(identity, "input", None)
     assert sink_node.outputs == []
예제 #20
0
 def _get_pipelines(self) -> Dict[str, Pipeline]:
     """Expose a single default pipeline containing one identity node."""
     default = Pipeline([node(identity, "cars", "boats")])
     return {"__default__": default}
예제 #21
0
 def test_node_equals(self):
     """Nodes built from identical specs compare equal but are distinct objects."""
     left = node(identity, "input1", "output1", name="a node")
     right = node(identity, "input1", "output1", name="a node")
     assert left == right
     assert left is not right
예제 #22
0
 def test_spark_pickle(self, is_async, data_catalog):
     """SparkDataSet(load) -> node -> PickleDataSet (save)"""
     spark_to_pickle = Pipeline([node(identity, "spark_in", "pickle_ds")])
     pattern = ".* was not serialized due to.*"
     runner = SequentialRunner(is_async=is_async)
     with pytest.raises(DataSetError, match=pattern):
         runner.run(spark_to_pickle, data_catalog)
예제 #23
0
 def test_node_invalid_equals(self):
     """A node never compares equal to an object of a different type."""
     some_node = node(identity, "input1", "output1", name="a node")
     assert some_node != "hello"
def create_pipeline(**kwargs):
    """Extraction pipeline for OVD complaint records.

    Every node takes one (or a list of) upstream dataset name(s) and emits a
    single dataset, so the nodes are declared as (func, inputs, outputs)
    triples and instantiated in one place, in the original declaration order.
    """
    to_excel_inputs = [
        "violencia_de_genero",
        "violencia_fisica",
        "genero_denunciante",
        "nacionalidad_denunciante",
        "est_civil_denunciante",
        "edad_denunciante",
        "est_denunciante",
        "domic_denunciante",
        "villa_denunciante",
        "ocupac_denunciante",
        "genero_acusado",
        "nacionalida_acusado",
        "est_civil_acusado",
        "edad_acusado",
        "instruccion_acusado",
        "domicilio_acusado",
        "ocupacion_acusado",
        "relacion",
        "convivencia",
        "denuncia_anterior",
        "medidas_prot",
        "dia_hecho",
        "riesgo",
        "violencia_psico",
        "violencia_econ",
        "violencia_sex",
        "violencia_soc",
        "violencia_amb",
        "violencia_simb",
        "hijos_en_comun",
        "frecuencia",
        "frases_sin_comillas",
        "frases_agresion",
        "frases_comillas",
        "fecha_del_hecho",
        "fecha_denuncia",
        "horario_hecho",
        "numero_legajo",
    ]
    steps = [
        # Base tidy data plus top-level extractions from it.
        (tidy_data, "filt_list", "data"),
        (buscar_numero_legajo, "data", "numero_legajo"),
        (buscar_violencia_genero, "data", "violencia_de_genero"),
        (buscar_violencia_fisica, "data", "violencia_fisica"),
        (buscar_info_denunciante, "data", "info_denunciante"),
        (buscar_info_acusado, "data", "info_acusado"),
        # Complainant details.
        (buscar_genero_den, "info_denunciante", "genero_denunciante"),
        (buscar_nacionalidad_denunciante, "info_denunciante",
         "nacionalidad_denunciante"),
        (buscar_est_civil_denunciente, "info_denunciante",
         "est_civil_denunciante"),
        (buscar_edad_denunciante, "info_denunciante", "edad_denunciante"),
        (buscar_est_denunciante, "info_denunciante", "est_denunciante"),
        (buscar_domic_denunciante, "info_denunciante", "domic_denunciante"),
        (buscar_villa_denunciante, "info_denunciante", "villa_denunciante"),
        (buscar_ocupac_denunciante, "info_denunciante", "ocupac_denunciante"),
        # Accused details.
        (buscar_genero_acusado, "info_acusado", "genero_acusado"),
        (buscar_nacionalidad_acusado, "info_acusado", "nacionalida_acusado"),
        (buscar_est_civil_acusado, "info_acusado", "est_civil_acusado"),
        (buscar_edad_acusado, "info_acusado", "edad_acusado"),
        (buscar_instruccion_acusado, "info_acusado", "instruccion_acusado"),
        (buscar_domicilio_acusado, "info_acusado", "domicilio_acusado"),
        (buscar_ocupacion_acusado, "info_acusado", "ocupacion_acusado"),
        (buscar_relacion, "info_acusado", "relacion"),
        (chequear_conv, "info_acusado", "convivencia"),
        # Incident details.
        (buscar_info_episodio, "data", "info_episodio"),
        (buscar_denuncia_anterior, "info_episodio", "denuncia_anterior"),
        (buscar_medidas_prot, "info_episodio", "medidas_prot"),
        (buscar_dia_hecho, "data", "dia_hecho"),
        (buscar_conclusiones, "data", "conclusiones"),
        (buscar_riesgo, "conclusiones", "riesgo"),
        # Final-report extractions.
        (buscar_informe_final, "data", "informe_final"),
        (buscar_violencia_amb, "informe_final", "violencia_amb"),
        (buscar_violencia_econ, "informe_final", "violencia_econ"),
        (buscar_violencia_genero, "informe_final", "violencia_genero"),
        (buscar_violencia_psico, "informe_final", "violencia_psico"),
        (buscar_violencia_sex, "informe_final", "violencia_sex"),
        (buscar_violencia_simb, "informe_final", "violencia_simb"),
        (buscar_violencia_soc, "informe_final", "violencia_soc"),
        (buscar_hijos, "informe_final", "hijos"),
        (buscar_hijos_en_comun, "hijos", "hijos_en_comun"),
        (buscar_frecuencia, "informe_final", "frecuencia"),
        # Quoted phrases and dates.
        (buscar_dijo, "data", "frases_agresion"),
        (buscar_dijo_sin_comillas, "data", "frases_sin_comillas"),
        (buscar_comillas, "data", "frases_comillas"),
        (buscar_fecha_del_hecho, ["info_episodio", "fecha_denuncia"],
         "fecha_del_hecho"),
        (buscar_horario_hecho, "info_episodio", "horario_hecho"),
        (buscar_fecha_denuncia, "data", "fecha_denuncia"),
        # Collect everything into the final spreadsheet.
        (to_excel, to_excel_inputs, "excel_ovd_data"),
    ]
    return Pipeline([
        node(func=func, inputs=inputs, outputs=outputs)
        for func, inputs, outputs in steps
    ])
예제 #25
0
 def test_different_output_list_order_not_equal(self):
     """Output ordering is significant when comparing nodes for equality."""
     original = node(identity, "input1", ["output1", "output2"], name="A")
     reordered = node(identity, "input1", ["output2", "output1"], name="A")
     assert original != reordered
예제 #26
0
"""Contents of hello_kedro.py"""
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import node, Pipeline
from kedro.runner import SequentialRunner

# Prepare a data catalog
# NOTE(review): "example_data" is not referenced by the nodes below -- verify
# whether this registration is still needed.
data_catalog = DataCatalog({"example_data": MemoryDataSet()})


# Prepare second node
def join_statements(greeting):
    """Append " Kedro!" to the incoming greeting."""
    return "{} Kedro!".format(greeting)


# Node wiring: my_salutation -> join_statements -> my_message
join_statements_node = node(join_statements,
                            inputs="my_salutation",
                            outputs="my_message")


# Prepare first node
def return_greeting():
    """Produce the fixed greeting fed into the pipeline."""
    greeting = "Bonjourno"
    return greeting


# Node wiring: (no input) -> return_greeting -> my_salutation
return_greeting_node = node(return_greeting,
                            inputs=None,
                            outputs="my_salutation")

# Assemble nodes into a pipeline
# NOTE(review): the nodes are listed out of execution order here; presumably
# the Pipeline resolves run order from data dependencies -- confirm.
pipeline = Pipeline([join_statements_node, return_greeting_node])
예제 #27
0
def test_bad_input(func, expected):
    """A bad input spec should be rejected with a TypeError matching *expected*."""
    bad_args = func()
    with pytest.raises(TypeError, match=expected):
        node(*bad_args)
예제 #28
0
def create_pipeline(**kwargs):
    """Active-learning benchmark pipeline.

    Truncates the training set, precomputes a Gaussian kernel, then runs the
    sampling analyses (active, active-lambda, passive, batch descent/ascent).
    The commented-out split_train_pool node that used to head the list has
    been removed as dead code.
    """
    return Pipeline(
        [
            # Limit the analysis to SIZE_ANALYSIS training examples.
            node(
                func=truncate_dataset,
                inputs=dict(
                    X_train_full="X_train_full",
                    y_train_full="y_train_full",
                    size="params:SIZE_ANALYSIS",
                ),
                outputs=["X_train_trunc", "y_train_trunc"],
                tags=["pre_sampling"],
            ),
            # Kernel is computed once and shared by the sampling nodes below.
            node(
                func=compute_gaussian_kernel,
                inputs=dict(X="X_train_trunc"),
                outputs="K_FIXE",
                tags=["pre_sampling"],
            ),
            node(
                func=al_performances,
                inputs=dict(
                    bs="params:BATCH_SEQ",
                    budget="params:BUDGET",
                    n_simu="params:N_SIMULATIONS",
                    X_train_full="X_train_trunc",
                    y_train_full="y_train_trunc",
                    X_test="X_test",
                    y_test="y_test",
                    K_FIXE="K_FIXE",
                    n_init="params:N_INIT",
                ),
                outputs="al_perfs",
                tags=["sampling", "active_sampling"],
            ),
            node(
                func=lambda_analysis,
                inputs=dict(
                    b="params:BATCH_SIZE",
                    budget="params:BUDGET",
                    n_simu="params:N_SIMULATIONS",
                    X_train_full="X_train_trunc",
                    y_train_full="y_train_trunc",
                    X_test="X_test",
                    y_test="y_test",
                    n_init="params:N_INIT",
                    K_FIXE="K_FIXE",
                ),
                outputs="al_lam_perfs",
                tags=["sampling", "active_lambda_sampling"],
            ),
            node(
                func=pl_performances,
                inputs=dict(
                    bs="params:BATCH_SEQ",
                    budget="params:BUDGET",
                    n_simu="params:N_SIMULATIONS",
                    X_train_full="X_train_trunc",
                    y_train_full="y_train_trunc",
                    X_test="X_test",
                    y_test="y_test",
                    n_init="params:N_INIT",
                ),
                outputs="pl_perfs",
                tags=["sampling", "passive_sampling"],
            ),
            node(
                func=b_descent_analysis,
                inputs=dict(
                    budget="params:BUDGET",
                    n_simu="params:N_SIMULATIONS",
                    X_train_full="X_train_trunc",
                    y_train_full="y_train_trunc",
                    X_test="X_test",
                    y_test="y_test",
                    n_init="params:N_INIT",
                    b_descent_size="params:BATCH_DESCENT_SIZE",
                    K_FIXE="K_FIXE",
                ),
                outputs="b_descent_perfs",
                tags=["sampling", "active_descent_sampling"],
            ),
            # NOTE(review): ascent node reuses BATCH_DESCENT_SIZE for
            # b_ascent_size and the "active_descent_sampling" tag -- confirm
            # this is intentional rather than a copy-paste slip.
            node(
                func=b_ascent_analysis,
                inputs=dict(
                    budget="params:BUDGET",
                    n_simu="params:N_SIMULATIONS",
                    X_train_full="X_train_trunc",
                    y_train_full="y_train_trunc",
                    X_test="X_test",
                    y_test="y_test",
                    n_init="params:N_INIT",
                    b_ascent_size="params:BATCH_DESCENT_SIZE",
                    K_FIXE="K_FIXE",
                ),
                outputs="b_ascent_perfs",
                tags=["sampling", "active_descent_sampling"],
            ),
        ]
    )
예제 #29
0
 def test_tag_nodes_single_tag(self):
     """tag() with a bare string adds to the tags set rather than replacing it."""
     tagged = node(identity, "input", "output", tags="hello")
     tagged = tagged.tag("world")
     assert "hello" in tagged.tags
     assert "world" in tagged.tags
     assert len(tagged.tags) == 2
예제 #30
0
 def test_updated_partial(self):
     """A partial run through update_wrapper adopts the wrapped func's name."""
     wrapped = update_wrapper(partial(identity), identity)
     partial_node = node(wrapped, ["in"], ["out"])
     assert str(partial_node) == "identity([in]) -> [out]"
     assert partial_node.name == "identity([in]) -> [out]"
     assert partial_node.short_name == "Identity"