Example #1
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(
            node)
        if (error == ErrorTypes.NO_ERROR):
            # Must be a valid schema at this point.
            code.append("schema_" + node["id"] + "=")
            code.extend([
                CodeGenerationUtils.arrange_schema(
                    node["parameter"]["schema"]), os.linesep
            ])

            code.extend([
                "df_" + node["id"] + ' = spark.readStream.schema(schema_' +
                node["id"] + ")." + node["file_type"] + "(" +
                CodeGenerationUtils.arrange_parameter_value(
                    node["parameters"]["file_path"]) + ")", os.linesep
            ])

    return code, error
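
For reference, a minimal sketch of the PySpark snippet this generator would emit, assuming a node with id "1", file_type "csv", and illustrative schema and path values (the exact output of CodeGenerationUtils.arrange_schema is an assumption here):

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

spark = SparkSession.builder.getOrCreate()

# Hypothetical schema and path; the real values come from node["parameter"]["schema"]
# and node["parameters"]["file_path"].
schema_1 = StructType([
    StructField("id", IntegerType()),
    StructField("name", StringType()),
])
df_1 = spark.readStream.schema(schema_1).csv("/data/input_dir")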
Example #2
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        if (error == ErrorTypes.NO_ERROR):
            my_args = {
                "node_id": node["id"],
                "input_dfs": [df_name],
                "shared_function_set": shared_function_set,
                "additional_local_code": additional_local_code,
                "errors": errors
            }
            gen_code = CodeGenerationUtils.handle_instantination_or_call(
                node["parameters"], 'df_' + node["id"] + '=' + df_name + '.' +
                node["ddfo_name"] + '(', my_args)

            final_code = CodeGenerationUtils.merge_with_additional_code(
                gen_code, additional_local_code)

    return final_code, shared_function_set, error
Example #3
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist={"df_count": {1}, "model_count": {0}}
    error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code=[]
    if(error == ErrorTypes.NO_ERROR):
        if(bool(extra["dfs"])):
            df_name="df_"+extra["dfs"][0]
        else:
            df_name = "df_" + extra["portions"][0][0] + "[" + str(extra["portions"][0][1]) + "]"

        code = ['estimator_' + node["id"] + ' = ' + node["estimator_name"] + '(']
        for param in node["parameters"]:
            code.extend([param + "=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"][param]), ", "])
        if (len(node["parameters"]) > 0):
            code.pop()
        code.extend([")", os.linesep])

        code.extend(['model_' + node["id"] + "=" + 'estimator_' + node["id"] + ".fit(" + df_name + ")", os.linesep])
        code.extend(['df_' + node["id"] + "=" + 'model_' + node["id"] + '.transform(' + df_name + ')', os.linesep])

    return code, error
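
To make the shape of the emitted snippet concrete, here is a hand-written equivalent assuming node id "3", estimator_name "LogisticRegression", illustrative parameters, and a small stand-in for the upstream DataFrame:

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()

# Stand-in for the upstream DataFrame referenced by df_name.
df_2 = spark.createDataFrame(
    [(Vectors.dense([0.0, 1.1]), 0.0), (Vectors.dense([2.0, 1.0]), 1.0)],
    ["features", "label"],
)

estimator_3 = LogisticRegression(maxIter=10, regParam=0.01)
model_3 = estimator_3.fit(df_2)
df_3 = model_3.transform(df_2)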
Example #4
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        if (bool(extra["dfs"])):
            df_name = "df_" + extra["dfs"][0]
        else:
            df_name = "df_" + extra["portions"][0][0] + "[" + str(
                extra["portions"][0][1]) + "]"

        code.append(
            df_name + '.selectExpr("CAST(' +
            node["parameters"]["unique_column_name"] +
            ' AS STRING) AS key", "to_json(struct(*)) AS value").write.format("kafka").option("kafka.bootstrap.servers", '
        )
        code.append(
            CodeGenerationUtils.arrange_parameter_value(
                node["parameters"]["host"] + ":" +
                node["parameters"]["port"]) + ")")
        code.extend([
            '.option("topic", ' + CodeGenerationUtils.arrange_parameter_value(
                node["parameters"]["topic"]) + ").save()", os.linesep
        ])

    return code, error
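
Concatenated, the generated batch Kafka sink looks roughly like the sketch below; the upstream df_2, the unique column "id", and the host/port/topic values are illustrative assumptions, and actually running it requires a reachable broker plus the spark-sql-kafka package:

# df_2 stands in for the upstream DataFrame resolved from extra["dfs"]/extra["portions"].
(df_2.selectExpr("CAST(id AS STRING) AS key", "to_json(struct(*)) AS value")
     .write.format("kafka")
     .option("kafka.bootstrap.servers", "localhost:9092")
     .option("topic", "example_topic")
     .save())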
Example #5
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist={"df_count": {0}, "model_count": {0}}
    error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code=[]
    shared_function_set = set()
    additional_local_code=[]
    errors=[]
    if(error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate=DataSourceValidityChecker.check_validity(node)
        if(error == ErrorTypes.NO_ERROR):
            my_args = {"node_id": node["id"], "shared_function_set": shared_function_set, "additional_local_code": additional_local_code, "errors": errors}
            # Must be a valid schema at this point.
            additional_code, param_string = CodeGenerationUtils.handle_parameter(node["parameter"]["schema"], my_args)
            gen_code=[]
            gen_code.extend(additional_code)

            gen_code.append("df_" + node["id"] + ' = spark.readStream.format("kafka").option("kafka.bootstrap.servers", ')
            gen_code.append(CodeGenerationUtils.handle_primitive(node["parameters"]["host"]["value"] + ":" + node["parameters"]["port"]["value"]) + ")")
            gen_code.append('.option("subscribe", ' + CodeGenerationUtils.handle_primitive(node["parameters"]["topic"]["value"]) + ")")
            gen_code.append('.load().select(from_json(col("value").cast("string"), '+ param_string +")")
            # For streams, keep the timestamp so it can later be used as a key when writing to a Kafka topic, if needed.
            gen_code.extend(['.alias("value"), "timestamp").select("value.*", "timestamp")', os.linesep])

            final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)

    return final_code, shared_function_set, error
Example #6
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    shared_function_set = set()
    errors = []
    code = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        shared_function_set.add(SharedFunctionTypes.VECTOR_DISASSEMBLER)
        code = [
            "df_" + node["id"] + " = " + "vector_disassembler(" + df_name +
            ", " + CodeGenerationUtils.handle_primitive(
                node["parameters"]["vector_column"]["value"]) + ")", os.linesep
        ]

    return code, shared_function_set, error
Example #7
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {1}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        model_id = extra["models"][0]["source_id"]

        code = [
            "df_" + node["id"] + "=" + "model_" + model_id + ".transform(" +
            df_name + ")", os.linesep
        ]

    return code, shared_function_set, error
Example #8
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist={"df_count": {1}, "model_count": {0}}
    error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code=[]
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if(error == ErrorTypes.NO_ERROR):
        if("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        my_args = {"node_id": node["id"], "input_dfs": [df_name], "shared_function_set": shared_function_set, "additional_local_code": additional_local_code, "errors": errors}
        # Depending on the column indicated by multi_instance_indicator, decide whether to apply multi-instance generation or the usual single-instance generation.
        if(MultiInstanceHandlerUtils.should_generate_multiple_instances(node)):
            gen_code = MultiInstanceHandlerUtils.multi_instance_generation(node, df_name, my_args)
        else:
            gen_code = __single_generation(node, df_name, my_args)

        final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)

    return final_code, shared_function_set, error
Example #9
def generate_code(args):
    node=args["node"]
    requireds_info=args["requireds_info"]
    edges=args["edges"]

    checklist={"df_count": {0}, "model_count": {0}}
    error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code=[]
    if(error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate=DataSourceValidityChecker.check_validity(node)
        if(error == ErrorTypes.NO_ERROR):
            remaining_params = list(node["parameters"].keys())
            remaining_params.remove("file_path")
            if(is_schema_appropriate):
                code.append("schema_"+node["id"]+"=")
                code.extend([CodeGenerationUtils.arrange_schema(node["parameter"]["schema"]), os.linesep])
                code.append("df_" + node["id"] + "=" + "spark.read."+ node["file_type"] +"(path=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"] + ", " + "schema="+ "schema_"+node["id"]))
                remaining_params.remove("schema")
            else:
                if(node["can_infer_schema"]):
                    code.append("df_" + node["id"] + "=" + "spark.read." + node["file_type"] + "(path=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"]) +", " +"inferSchema="+"True")
                else:
                    code.append("df_" + node["id"] + "=" + "spark.read." + node["file_type"] + "(path=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"]))

            for param in remaining_params:
                code.extend([", " + param + "=" + CodeGenerationUtils.arrange_parameter_value(node["parameters"][param])])
            code.extend([")", os.linesep])

    return code, error
Example #10
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        if (bool(extra["dfs"])):
            df_name = "df_" + extra["dfs"][0]
        else:
            df_name = "df_" + extra["portions"][0][0] + "[" + str(
                extra["portions"][0][1]) + "]"

        code.extend([
            "df_" + node["id"] + "=" + df_name + ".randomSplit(" +
            CodeGenerationUtils.arrange_parameter_value(
                node["parameters"]["weights"]) + ", " +
            CodeGenerationUtils.arrange_parameter_value(
                node["parameters"]["seed"]) + ")", os.linesep
        ])

    return code, error
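
The emitted code is a single randomSplit call whose result is a list of DataFrames; downstream nodes then select one element through the "portion" index used by the other generators. A sketch with assumed weights and seed:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_9 = spark.range(100)                    # stand-in for the upstream DataFrame
df_10 = df_9.randomSplit([0.8, 0.2], 42)   # df_10[0] / df_10[1] are the portions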
Example #11
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist={"df_count": {1}, "model_count": {0}}
    error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code=[]
    shared_function_set = set()
    additional_local_code=[]
    errors=[]
    if(error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        my_args={"node_id": node["id"], "input_dfs": [df_name], "shared_function_set": shared_function_set, "additional_local_code": additional_local_code, "errors": errors}

        updated_function_name = CodeGenerationUtils.handle_parameter(node["parameters"]["udf_function"], my_args)
        gen_code=[]
        gen_code.extend(["udf_"+node["id"]+" = udf("+updated_function_name+", "+node["parameters"]["udf_return_type"]["value"]+"())", os.linesep])

        gen_code.extend(["tuple_list = " + CodeGenerationUtils.handle_parameter(node["parameters"]["udf_input_tuples"], my_args), os.linesep])
        gen_code.extend(["output_list = " + CodeGenerationUtils.handle_parameter(node["parameters"]["udf_outputs"], my_args), os.linesep])
        gen_code.extend(["df_" + node["id"] + "=" + df_name, os.linesep])
        gen_code.extend(["for index in range(len(tuple_list)):", os.linesep])
        gen_code.extend(["\tdf_"+node["id"]+" = df_"+node["id"]+".withColumn(output_list[index], udf_"+node["id"]+"(*tuple_list[index]))", os.linesep, os.linesep])

        final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)

    return final_code, shared_function_set, error
Example #12
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        error, extra2 = CVValiditiyChecker.check_validity(
            node["nodes"], node["edges"])

        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        my_args = {
            "node_id": node["id"],
            "input_dfs": [df_name],
            "shared_function_set": shared_function_set,
            "additional_local_code": additional_local_code,
            "errors": errors
        }
        gen_code = []
        gen_code.extend(
            __generate_code_for_estimator_instantination(
                node["nodes"][extra2["estimator_node_id"]], my_args))
        gen_code.extend(
            __generate_code_for_evaluator_instantination(
                node["nodes"][extra2["evaluator_node_id"]], my_args))
        gen_code.extend(
            __generate_code_for_param_grid(
                node, 'estimator_' + extra2["estimator_node_id"], my_args))
        gen_code.extend(
            __generate_code_for_cv_instantination(node,
                                                  extra2["estimator_node_id"],
                                                  extra2["evaluator_node_id"]))

        gen_code.extend([
            'model_' + node["id"] + "=" + 'cv_' + node["id"] + ".fit(" +
            df_name + ")", os.linesep
        ])
        # The following might not be logical unless you aim to predict on training data for some specific needs.
        gen_code.extend([
            'df_' + node["id"] + "=" + 'model_' + node["id"] + '.transform(' +
            df_name + ')', os.linesep
        ])

        final_code = CodeGenerationUtils.merge_with_additional_code(
            gen_code, additional_local_code)

    return final_code, shared_function_set, error
Example #13
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    if (error == ErrorTypes.NO_ERROR):
        error, pipeline_order = PipelineValidityChecker.check_validity(
            node["nodes"], node["edges"])
        if (error == ErrorTypes.NO_ERROR):
            if ("portion" in extra["dfs"][0]):
                df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                    extra["dfs"][0]["portion"]) + "]"
            else:
                df_name = "df_" + extra["dfs"][0]["source_id"]

            my_args = {
                "node_id": node["id"],
                "input_dfs": [df_name],
                "shared_function_set": shared_function_set,
                "additional_local_code": additional_local_code,
                "errors": errors
            }
            gen_code, error = __generate_stages(node["nodes"], pipeline_order,
                                                df_name, my_args)
            if (error == ErrorTypes.NO_ERROR):
                gen_code.append(os.linesep)
                gen_code.extend(
                    __generate_code_for_pipeline_instantination(
                        node, pipeline_order, my_args))

                gen_code.extend([
                    'model_' + node["id"] + "=" + 'pipeline_' + node["id"] +
                    ".fit(" + df_name + ")", os.linesep
                ])
                # The following might not be logical for pipelines with an estimator
                gen_code.extend([
                    'df_' + node["id"] + "=" + 'model_' + node["id"] +
                    '.transform(' + df_name + ')', os.linesep
                ])

                final_code = CodeGenerationUtils.merge_with_additional_code(
                    gen_code, additional_local_code)

    return final_code, shared_function_set, error
Example #14
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist={"df_count": {0}, "model_count": {0}}
    error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code=[]
    shared_function_set = set()
    if(error == ErrorTypes.NO_ERROR):

        code = ["model_" + node["id"] + "=" + node["parameters"]["model_type"]["value"] +".load(" + CodeGenerationUtils.handle_primitive(node["parameters"]["model_path"]["value"]) + ")", os.linesep]

    return code, shared_function_set, error
Example #15
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        my_args = {
            "node_id": node["id"],
            "input_dfs": [df_name],
            "shared_function_set": shared_function_set,
            "additional_local_code": additional_local_code,
            "errors": errors
        }
        gen_code = []

        shared_function_set.add(SharedFunctionTypes.SELECT_EXPR_HELPERS)
        gen_code.append("df_" + node["id"] + "=" + df_name + ".selectExpr(")

        for expr in node["parameters"]["expressions"]["value"]:
            gen_code.extend([
                'single_select_expr_generator(' +
                CodeGenerationUtils.handle_parameter(expr["input_cols"],
                                                     my_args) +
                ', ' + CodeGenerationUtils.handle_parameter(
                    expr["output_cols"], my_args) +
                ', ' + CodeGenerationUtils.handle_parameter(
                    expr["operation"], my_args) + ')', ', '
            ])

        gen_code.pop()
        gen_code.extend([")", os.linesep])

        final_code = CodeGenerationUtils.merge_with_additional_code(
            gen_code, additional_local_code)

    return final_code, shared_function_set, error
Example #16
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        code.append(
            "query_" + node["id"] + "=" + df_name + '.selectExpr("CAST(' +
            node["parameters"]["unique_column_name"]["value"] +
            ' AS STRING) AS key", "to_json(struct(*)) AS value").writeStream.format("kafka").option("kafka.bootstrap.servers", '
        )
        code.append(
            CodeGenerationUtils.handle_primitive(
                node["parameters"]["host"]["value"] + ":" +
                node["parameters"]["port"]["value"]) + ")")
        code.append(".trigger(" + __generate_trigger_code(node) + ")")
        code.append('.option("topic", ' + CodeGenerationUtils.handle_primitive(
            node["parameters"]["topic"]["value"]) + ")")
        code.append('.option("checkpointLocation", ' +
                    CodeGenerationUtils.handle_primitive(
                        node["parameters"]["checkpointLocation"]["value"]) +
                    ").start()")
        code.extend([
            os.linesep, "query_" + node["id"], ".awaitTermination()",
            os.linesep
        ])

        args["additional_info"]["written_topics"].append({
            "topic_name":
            node["parameters"]["topic"]["value"],
            "host":
            node["parameters"]["host"]["value"],
            "port":
            node["parameters"]["port"]["value"]
        })

    return code, shared_function_set, error
Example #17
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(
            node)
        if (error == ErrorTypes.NO_ERROR):
            my_args = {
                "node_id": node["id"],
                "shared_function_set": shared_function_set,
                "additional_local_code": additional_local_code,
                "errors": errors
            }
            if (is_schema_appropriate):
                gen_code = CodeGenerationUtils.handle_instantination_or_call(
                    node["parameters"], "df_" + node["id"] + "=" +
                    "spark.read." + node["file_type"] + "(", my_args)
            else:
                # For safety, but consider it again
                if ("schema" in node["parameters"]):
                    del node["parameters"]["schema"]

                if (node["can_infer_schema"]):
                    node["parameters"]["inferSchema"] = {
                        "value": True,
                        "type": "boolean"
                    }

                gen_code = CodeGenerationUtils.handle_instantination_or_call(
                    node["parameters"],
                    "df_" + node["id"] + "=" + "spark.read.format(" +
                    CodeGenerationUtils.handle_primitive(node["file_type"]) +
                    ").load(", my_args)

            final_code = CodeGenerationUtils.merge_with_additional_code(
                gen_code, additional_local_code)

    return final_code, shared_function_set, error
Example #18
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {2}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        df_names = __get_dfs_to_join(extra)
        code.extend([
            "df_" + node["id"] + "=" + df_names[0] + ".join(" + df_names[1] +
            ", " + CodeGenerationUtils.arrange_parameter_value(
                node["parameters"]["join_column"]) + ")", os.linesep
        ])

    return code, error
Example #19
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        error, extra2 = CVValiditiyChecker.check_validity(
            node["nodes"], node["edges"])

        if (bool(extra["dfs"])):
            df_name = "df_" + extra["dfs"][0]
        else:
            df_name = "df_" + extra["portions"][0][0] + "[" + str(
                extra["portions"][0][1]) + "]"

        code.extend(
            __generate_code_for_estimator_instantination(
                node["nodes"][extra2["estimator_node_id"]]))
        code.extend(
            __generate_code_for_evaluator_instantination(
                node["nodes"][extra2["evaluator_node_id"]]))
        code.extend(
            __generate_code_for_param_grid(
                node, 'estimator_' + extra2["estimator_node_id"]))
        code.extend(
            __generate_code_for_cv_instantination(node,
                                                  extra2["estimator_node_id"],
                                                  extra2["evaluator_node_id"]))

        code.extend([
            'model_' + node["id"] + "=" + 'cv_' + node["id"] + ".fit(" +
            df_name + ")", os.linesep
        ])
        # The following might not be logical unless you aim to predict on training data for some specific needs.
        code.extend([
            'df_' + node["id"] + "=" + 'model_' + node["id"] + '.transform(' +
            df_name + ')', os.linesep
        ])

    return code, error
Example #20
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {0}, "model_count": {1}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):

        model_id = extra["models"][0]

        code = [
            "model_" + model_id + ".save(" +
            CodeGenerationUtils.arrange_parameter_value(
                node["parameters"]["model_path"]) + ")", os.linesep
        ]

    return code, error
Example #21
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist={"df_count": {1}, "model_count": {0}}
    error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code=[]
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if(error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        my_args = {"node_id": node["id"], "input_dfs": [df_name], "shared_function_set": shared_function_set, "additional_local_code": additional_local_code, "errors": errors}

        input_cols = CodeGenerationUtils.handle_parameter(node["parameters"]["input_cols"], my_args)
        output_cols = CodeGenerationUtils.handle_parameter(node["parameters"]["output_cols"], my_args)

        window_size = node["parameters"]["window_size"]["value"]
        partitioning_column = node["parameters"]["partitioning_column"]["value"]
        ordering_column = node["parameters"]["ordering_column"]["value"]
        ordering_direction = node["parameters"]["ordering_direction"]["value"]
        gen_code=[]
        gen_code.extend(["input_cols = " + output_cols, os.linesep])
        gen_code.extend(["output_cols = " + input_cols, os.linesep])
        gen_code.extend(["df_" + node["id"] + "=" + df_name, os.linesep])
        gen_code.extend(["for inC, outC in zip(input_cols, output_cols):", os.linesep])
        gen_code.extend(["\tdf_" + node["id"] + " = df_" + node["id"] + ".withColumn('temp', col(inC))", os.linesep])
        gen_code.extend(["\twSpec = Window.partitionBy('" + partitioning_column + "').orderBy(col('" + ordering_column + "')." + ordering_direction + "())", os.linesep])
        gen_code.extend(["\tfor j in range(" + str(window_size) + "):", os.linesep])
        gen_code.extend(["\t\tlag_values = lag('temp', default=0).over(wSpec)", os.linesep])
        gen_code.extend(["\t\tdf_" + node["id"] + " = df_" + node["id"] + ".withColumn('temp', F.when((col('temp')==1) | (lag_values==None) | (lag_values<1) | (lag_values>=" + str(window_size + 1) + "), col('temp')).otherwise(lag_values+1))", os.linesep])
        gen_code.extend(["\tdf_" + node["id"] + " = df_" + node["id"] + ".withColumn(outC, F.when(col('temp') > 0, 1.0).otherwise(0.0))", os.linesep])

        final_code = CodeGenerationUtils.merge_with_additional_code(gen_code, additional_local_code)

    return final_code, shared_function_set, error
Example #22
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    code = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(
            node)
        if (error == ErrorTypes.NO_ERROR):
            # Must be a valid schema at this point.
            code.append("schema_" + node["id"] + "=")
            code.extend([
                CodeGenerationUtils.arrange_schema(
                    node["parameter"]["schema"]), os.linesep
            ])

            code.append(
                "df_" + node["id"] +
                ' = spark.readStream.format("kafka").option("kafka.bootstrap.servers", '
            )
            code.append(
                CodeGenerationUtils.arrange_parameter_value(
                    node["parameters"]["host"] + ":" +
                    node["parameters"]["port"]) + ")")
            code.append('.option("subscribe", ' +
                        CodeGenerationUtils.arrange_parameter_value(
                            node["parameters"]["topic"] + ")"))
            code.append(
                '.load().select(from_json(col("value").cast("string"), schema_'
                + node["id"] + ")")
            # For streams, keep the timestamp so it can later be used as a key when writing to a Kafka topic, if needed.
            code.extend([
                '.alias("value"), "timestamp").select("value.*", "timestamp")',
                os.linesep
            ])

    return code, error
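
Joined together, the fragments form a streaming Kafka source roughly like the following; node id "9" and the schema, host, port, and topic values are assumptions:

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.getOrCreate()

schema_9 = StructType([StructField("name", StringType())])
df_9 = (spark.readStream.format("kafka")
        .option("kafka.bootstrap.servers", "localhost:9092")
        .option("subscribe", "example_topic")
        .load()
        .select(from_json(col("value").cast("string"), schema_9).alias("value"), "timestamp")
        .select("value.*", "timestamp"))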
Example #23
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist={"df_count": {1}, "model_count": {0}}
    error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code=[]
    if(error == ErrorTypes.NO_ERROR):
        if (bool(extra["dfs"])):
            df_name = "df_" + extra["dfs"][0]
        else:
            df_name = "df_" + extra["portions"][0][0] + "[" + str(extra["portions"][0][1]) + "]"

        code.append("query_" + node["id"] + "=" + df_name + ".writeStream.format("+CodeGenerationUtils.arrange_parameter_value(node["file_type"])+")")
        code.append(".trigger("+ __generate_trigger_code(node) +")")
        code.append('.option("path", '+ CodeGenerationUtils.arrange_parameter_value(node["parameters"]["file_path"]) + ")")
        code.append('.option("checkpointLocation", ' + CodeGenerationUtils.arrange_parameter_value(node["parameters"]["checkpoint_path"]) + ").start()")
        code.extend([os.linesep, "query_" + node["id"], ".awaitTermination()", os.linesep])

    return code, error
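
Read end to end, the generated query is a file-sink streaming write like the sketch below; the processing-time trigger stands in for whatever __generate_trigger_code produces, and the format, paths, and upstream df_10 are illustrative assumptions:

# df_10 stands in for the upstream streaming DataFrame.
query_11 = (df_10.writeStream.format("parquet")
            .trigger(processingTime="10 seconds")
            .option("path", "/data/output")
            .option("checkpointLocation", "/data/checkpoints")
            .start())
query_11.awaitTermination()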
Example #24
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {0}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        error, is_schema_appropriate = DataSourceValidityChecker.check_validity(
            node)
        if (error == ErrorTypes.NO_ERROR):
            my_args = {
                "node_id": node["id"],
                "shared_function_set": shared_function_set,
                "additional_local_code": additional_local_code,
                "errors": errors
            }
            # Must be a valid schema at this point.
            additional_code, param_string = CodeGenerationUtils.handle_parameter(
                node["parameter"]["schema"], my_args)
            gen_code = []
            gen_code.extend(additional_code)

            gen_code.extend([
                "df_" + node["id"] + ' = spark.readStream.schema(' +
                param_string + ")." + node["file_type"] + "(" +
                CodeGenerationUtils.handle_primitive(
                    node["parameters"]["path"]["value"]) + ")", os.linesep
            ])

            final_code = CodeGenerationUtils.merge_with_additional_code(
                gen_code, additional_local_code)

    return final_code, shared_function_set, error
Example #25
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        my_args = {
            "node_id": node["id"],
            "input_dfs": [df_name],
            "shared_function_set": shared_function_set,
            "additional_local_code": additional_local_code,
            "errors": errors
        }

        gen_code = CodeGenerationUtils.handle_instantination_or_call(
            node["parameters"], df_name + ".write.format(" +
            CodeGenerationUtils.handle_primitive(node["file_type"]) +
            ").save(", my_args)

        final_code = CodeGenerationUtils.merge_with_additional_code(
            gen_code, additional_local_code)

        args["additional_info"]["written_tables"].append(
            {"table_path": node["parameters"]["path"]["value"]})

    return final_code, shared_function_set, error
Example #26
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        if (error == ErrorTypes.NO_ERROR):
            final_code = [
                "correlation_" + node["id"] + " = " + "Correlation.corr(" +
                df_name + ", " + CodeGenerationUtils.handle_primitive(
                    node["parameters"]["column"]["value"]) + ", " +
                CodeGenerationUtils.handle_primitive(
                    node["parameters"]["method"]["value"]) + ")", os.linesep
            ]
            final_code.extend([
                "result_array_" + node["id"] + " = ",
                "correlation_" + node["id"] + ".head()[0].toArray()",
                os.linesep
            ])
            # In the future, dynamically name columns according to an appropriate convention...
            final_code.extend([
                "df_" + node["id"] + " = sc.parallelize(" + "result_array_" +
                node["id"] + ")",
                ".map(lambda x: [float(i) for i in x]).toDF()", os.linesep
            ])

    return final_code, shared_function_set, error
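
Putting the generated pieces together, this node computes a correlation matrix and converts it back into a DataFrame, roughly as below; node id "8", the column and method values, and the tiny input DataFrame are assumptions, with sc being the active SparkContext:

from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# Stand-in for the upstream DataFrame with a vector column named "features".
df_7 = spark.createDataFrame(
    [(Vectors.dense([1.0, 2.0]),), (Vectors.dense([2.0, 4.0]),)], ["features"]
)

correlation_8 = Correlation.corr(df_7, "features", "pearson")
result_array_8 = correlation_8.head()[0].toArray()
df_8 = sc.parallelize(result_array_8).map(lambda x: [float(i) for i in x]).toDF()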
Example #27
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist={"df_count": {1}, "model_count": {0}}
    error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    final_code=[]
    shared_function_set = set()
    if(error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        if(error==ErrorTypes.NO_ERROR):
            # For now, directly return the test result (as a dataframe), which includes in a single row:
            # pValues: DenseVector
            # degreesOfFreedom: list
            # statistics: DenseVector
            final_code=["df_"+ node["id"] + " = " + node["parameters"]["test_type"]["value"] + ".test(" + df_name + ", " + CodeGenerationUtils.handle_primitive(node["parameters"]["features_column"]["value"]) + ", " + CodeGenerationUtils.handle_primitive(node["parameters"]["label_column"]["value"]) + ")", os.linesep]

    return final_code, shared_function_set, error
Example #28
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist={"df_count": {1}, "model_count": {0}}
    error, extra=IncomingEdgeValidityChecker.check_validity(node["id"], requireds_info, edges, checklist)
    code=[]
    shared_function_set = set()
    if(error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        code.append("query_" + node["id"] + "=" + df_name + ".writeStream.format(" + CodeGenerationUtils.handle_primitive(node["file_type"]) + ")")
        code.append(".trigger("+ __generate_trigger_code(node) +")")
        code.append('.option("path", ' + CodeGenerationUtils.handle_primitive(node["parameters"]["path"]["value"]) + ")")
        code.append('.option("checkpointLocation", ' + CodeGenerationUtils.handle_primitive(node["parameters"]["checkpointLocation"]["value"]) + ").start()")
        code.extend([os.linesep, "query_" + node["id"], ".awaitTermination()", os.linesep])

        args["additional_info"]["written_tables"].append({"table_path": node["parameters"]["path"]["value"]})

    return code, shared_function_set, error
Example #29
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    code = []
    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    if (error == ErrorTypes.NO_ERROR):
        error, pipeline_order = PipelineValidityChecker.check_validity(
            node["nodes"], node["edges"])
        if (error == ErrorTypes.NO_ERROR):
            code.extend(__generate_stages(node["nodes"], pipeline_order))
            code.append(os.linesep)
            code.extend(
                __generate_code_for_pipeline_instantination(
                    node, pipeline_order))

            if (bool(extra["dfs"])):
                df_name = "df_" + extra["dfs"][0]
            else:
                df_name = "df_" + extra["portions"][0][0] + "[" + str(
                    extra["portions"][0][1]) + "]"

            code.extend([
                'model_' + node["id"] + "=" + 'pipeline_' + node["id"] +
                ".fit(" + df_name + ")", os.linesep
            ])
            # The following might not be logical for pipelines with an estimator
            code.extend([
                'df_' + node["id"] + "=" + 'model_' + node["id"] +
                '.transform(' + df_name + ')', os.linesep
            ])

    return code, error
Example #30
def generate_code(args):
    node = args["node"]
    requireds_info = args["requireds_info"]
    edges = args["edges"]

    checklist = {"df_count": {1}, "model_count": {0}}
    error, extra = IncomingEdgeValidityChecker.check_validity(
        node["id"], requireds_info, edges, checklist)
    final_code = []
    shared_function_set = set()
    additional_local_code = []
    errors = []
    if (error == ErrorTypes.NO_ERROR):
        if ("portion" in extra["dfs"][0]):
            df_name = "df_" + extra["dfs"][0]["source_id"] + "[" + str(
                extra["dfs"][0]["portion"]) + "]"
        else:
            df_name = "df_" + extra["dfs"][0]["source_id"]

        my_args = {
            "node_id": node["id"],
            "input_dfs": [df_name],
            "shared_function_set": shared_function_set,
            "additional_local_code": additional_local_code,
            "errors": errors
        }

        df_name = "df_" + my_args["node_id"]
        gen_code = [df_name + " = " + my_args["input_dfs"][0], os.linesep]

        input_columns = ["["]
        conditions = ["["]
        values = ["["]
        otherwises = ["["]
        output_columns = ["["]
        for exp in node["parameters"]["expressions"]["value"]:
            input_columns.extend([
                CodeGenerationUtils.handle_parameter(exp["input_columns"],
                                                     my_args), ", "
            ])
            conditions.extend([
                CodeGenerationUtils.handle_parameter(exp["condition"],
                                                     my_args), ", "
            ])
            values.extend([
                CodeGenerationUtils.handle_parameter(exp["value"], my_args),
                ", "
            ])
            otherwises.extend([
                CodeGenerationUtils.handle_parameter(exp["otherwise"],
                                                     my_args), ", "
            ])
            output_columns.extend([
                CodeGenerationUtils.handle_parameter(exp["output_columns"],
                                                     my_args), ", "
            ])

        # Check that there is at least one element in expressions
        input_columns.pop()
        conditions.pop()
        values.pop()
        otherwises.pop()
        output_columns.pop()

        input_columns.extend(["]"])
        conditions.extend(["]"])
        values.extend(["]"])
        otherwises.extend(["]"])
        output_columns.extend(["]"])

        gen_code.extend(
            ["input_columns = " + ''.join(input_columns), os.linesep])
        gen_code.extend(["conditions = " + ''.join(conditions), os.linesep])
        gen_code.extend(["values = " + ''.join(values), os.linesep])
        gen_code.extend(["otherwises = " + ''.join(otherwises), os.linesep])
        gen_code.extend(
            ["output_columns = " + ''.join(output_columns), os.linesep])

        gen_code.extend([
            "for in_cols, cond, val, otw, out_cols in zip(input_columns, conditions, values, otherwises, output_columns):",
            os.linesep
        ])
        gen_code.extend(
            ["\tfor in_col, out_col in zip(in_cols, out_cols):", os.linesep])
        gen_code.extend([
            "\t\tcur_cond = eval(cond.replace('$','" + df_name +
            "[\"'+in_col+'\"]'" + "))", os.linesep
        ])
        gen_code.extend([
            "\t\t" + df_name + " = " + df_name +
            ".withColumn(out_col, F.when(cur_cond, val).otherwise(otw))",
            os.linesep
        ])

        final_code = CodeGenerationUtils.merge_with_additional_code(
            gen_code, additional_local_code)

    return final_code, shared_function_set, error