Example #1
 async def multicomm_dataflow(self, config, request):
     # Seed the network with inputs given by caller
     # TODO(p0,security) allowlist of valid definitions to seed (set
     # Input.origin to something other than seed)
     inputs = []
     # If data was sent add those inputs
     if request.method == "POST":
         # Accept a list of input data
         # TODO validate that input data is dict of list of inputs each item
         # has definition and value properties
         for ctx, client_inputs in (await request.json()).items():
             for input_data in client_inputs:
                if input_data["definition"] not in config.dataflow.definitions:
                     return web.json_response(
                         {
                             "error":
                             f"Missing definition for {input_data['definition']} in dataflow"
                         },
                         status=HTTPStatus.NOT_FOUND,
                     )
             inputs.append(
                 MemoryInputSet(
                     MemoryInputSetConfig(
                         ctx=StringInputSetContext(ctx),
                         inputs=[
                             Input(
                                 value=input_data["value"],
                                 definition=config.dataflow.definitions[
                                     input_data["definition"]],
                             ) for input_data in client_inputs
                         ],
                     )))
     # Run the operation in an orchestrator
     # TODO(dfass) Create the orchestrator on startup of the HTTP API itself
     async with MemoryOrchestrator.basic_config() as orchestrator:
         # TODO(dfass) Create octx on dataflow registration
         async with orchestrator(config.dataflow) as octx:
             results = {
                 str(ctx): result
                 async for ctx, result in octx.run(*inputs)
             }
             # TODO Implement input and presentation stages?
             """
             if config.presentation == "blob":
                 return web.Response(body=results)
             elif config.presentation == "text":
                 return web.Response(text=results)
             else:
             """
             return web.json_response(results)
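For reference, the POST body the handler above expects is a mapping from each context string to a list of objects with definition and value keys. A hypothetical request body is shown below; the context name "my_context" and the definition name "calc_string" are illustrative assumptions, not part of the example:

    {
        "my_context": [
            {"definition": "calc_string", "value": "add 40 and 2"}
        ]
    }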
Example #2
 async def test_get_noun_chunks(self):
     input_sentence = (
         "The end is the beginning , and the beginning is the end"
     )
     async for ctx, results in MemoryOrchestrator.run(
         DataFlow.auto(get_noun_chunks, GetSingle),
         [
             Input(
                 value=[get_noun_chunks.op.outputs["result"].name],
                 definition=GetSingle.op.inputs["spec"],
             ),
             Input(
                 value=input_sentence,
                 definition=get_noun_chunks.op.inputs["text"],
             ),
             Input(
                 value="en_core_web_sm",
                 definition=get_noun_chunks.op.inputs["spacy_model"],
             ),
         ],
     ):
         noun_chunks = results[get_noun_chunks.op.outputs["result"].name]
         self.assertEqual(len(noun_chunks), 4)
Example #3
File: test_df.py Project: wandreuscv/dffml
 async def test_run(self):
     calc_strings_check = {"add 40 and 2": 42, "multiply 42 and 10": 420}
     async with MemoryOrchestrator.basic_config(*OPIMPS) as orchestrator:
         async with orchestrator() as octx:
             for to_calc in calc_strings_check.keys():
                 await octx.ictx.sadd(
                     to_calc,
                     Input(
                         value=to_calc,
                         definition=parse_line.op.inputs["line"],
                     ),
                     Input(
                         value=[add.op.outputs["sum"].name],
                         definition=GetSingle.op.inputs["spec"],
                     ),
                 )
             async for ctx, results in octx.run_operations():
                 ctx_str = (await ctx.handle()).as_string()
                 results = results[GetSingle.op.name]
                 self.assertEqual(
                     calc_strings_check[ctx_str],
                     results[add.op.outputs["sum"].name],
                 )
Example #4
 async def setUp(self):
     self.dataflow = DataFlow(
         operations={
             "get_circle": get_circle.op,
             "get_single": GetSingle.imp.op,
         },
         seed=[
             Input(
                 value=[get_circle.op.outputs["shape"].name],
                 definition=GetSingle.op.inputs["spec"],
             )
         ],
         implementations={"get_circle": get_circle.imp},
     )
Example #5
 async def test_run(self):
     dataflow = DataFlow.auto(*OPIMPS)
     calc_strings_check = {"add 40 and 2": 42, "multiply 42 and 10": 420}
     async with MemoryOrchestrator.withconfig({}) as orchestrator:
         async with orchestrator(dataflow) as octx:
             async for ctx, results in octx.run({
                     to_calc: [
                         Input(
                             value=to_calc,
                             definition=calc_parse_line.op.inputs["line"],
                         ),
                         Input(
                             value=[calc_add.op.outputs["sum"].name],
                             definition=GetSingle.op.inputs["spec"],
                         ),
                     ]
                     for to_calc in calc_strings_check.keys()
             }):
                 ctx_str = (await ctx.handle()).as_string()
                 self.assertEqual(
                     calc_strings_check[ctx_str],
                     results[calc_add.op.outputs["sum"].name],
                 )
Example #6
File: test_io.py Project: up1512001/dffml
 async def test_print_output(self):
     test_inputs = [
         Input(
             value="Testing print_output",
             definition=self.OutputDataflow.definitions["DataToPrint"],
             parents=None,
         )
     ]
     async with MemoryOrchestrator.withconfig({}) as orchestrator:
         async with orchestrator(self.OutputDataflow) as octx:
             with contextlib.redirect_stdout(self.stdout):
                 async for ctx_str, _ in octx.run(test_inputs):
                     results = self.stdout.getvalue()
                     self.assertIn("Testing print_output", results)
Example #7
 async def test_one_hot_encoder(self):
     categories = [["Male", "Female"], [1, 2, 3]]
     input_data = [["Female", 1], ["Male", 3]]
     output_data = [[0.0, 1.0, 1.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0, 1.0]]
     async for ctx, results in MemoryOrchestrator.run(
             DataFlow.auto(one_hot_encoder, GetSingle),
         [
             Input(
                 value=[one_hot_encoder.op.outputs["result"].name],
                 definition=GetSingle.op.inputs["spec"],
             ),
             Input(
                 value=input_data,
                 definition=one_hot_encoder.op.inputs["data"],
             ),
             Input(
                 value=categories,
                 definition=one_hot_encoder.op.inputs["categories"],
             ),
         ],
     ):
         self.assertTrue((results[one_hot_encoder.op.outputs["result"].name]
                          == output_data).all())
Example #8
 async def test_lemmatizer(self):
     input_sentence = (
         "The end is the beginning , and the beginning is the end"
     )
     async for ctx, results in MemoryOrchestrator.run(
         DataFlow.auto(lemmatizer, GetSingle),
         [
             Input(
                 value=[lemmatizer.op.outputs["result"].name],
                 definition=GetSingle.op.inputs["spec"],
             ),
             Input(
                 value=input_sentence,
                 definition=lemmatizer.op.inputs["text"],
             ),
             Input(
                 value="en_core_web_sm",
                 definition=lemmatizer.op.inputs["spacy_model"],
             ),
         ],
     ):
         lemma_list = results[lemmatizer.op.outputs["result"].name]
         self.assertEqual(len(input_sentence.split()), len(lemma_list))
Example #9
 async def test_run(self):
     packages = {
         "http://pkg.freebsd.org/FreeBSD:13:amd64/latest/All/ImageMagick7-7.0.8.48.txz":
         {},
         "https://download.clearlinux.org/releases/10540/clear/x86_64/os/Packages/sudo-setuid-1.8.17p1-34.x86_64.rpm":
         {
             "./usr/bin/sudo": True
         },
         "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/20/Everything/x86_64/os/Packages/c/curl-7.32.0-3.fc20.x86_64.rpm":
         {
             "./usr/bin/curl": False
         },
     }
     found = dict(zip(packages.keys(), [False] * len(packages)))
     async for ctx, results in MemoryOrchestrator.run(
             dataflow,
         {
             URL: [
                 Input(value=URL,
                       definition=URLToURLBytes.op.inputs["URL"]),
                 Input(
                     value=["rpm_filename", "binary_is_PIE"],
                     definition=Associate.op.inputs["spec"],
                 ),
             ]
             for URL in packages
         },
             strict=True,
     ):
         package_url = (await ctx.handle()).as_string()
         with self.subTest(package_url=package_url):
             self.assertIn("binary_is_PIE", results)
             self.assertDictEqual(results["binary_is_PIE"],
                                  packages[package_url])
         found[package_url] = True
     self.assertTrue(all(found.values()),
                     "Not all packages we analyized: f{found}")
Example #10
 async def test_tfidf_vectorizer(self):
     input_sentence = [
         "The end is the beginning. The beginning is the end."
     ]
     async for ctx, results in MemoryOrchestrator.run(
         DataFlow.auto(tfidf_vectorizer, GetSingle),
         [
             Input(
                 value=[tfidf_vectorizer.op.outputs["result"].name],
                 definition=GetSingle.op.inputs["spec"],
             ),
             Input(
                 value=input_sentence,
                 definition=tfidf_vectorizer.op.inputs["text"],
             ),
             Input(
                 value=[1, 1],
                 definition=count_vectorizer.op.inputs["ngram_range"],
             ),
             Input(
                 value=True,
                 definition=tfidf_vectorizer.op.inputs["get_feature_names"],
             ),
         ],
     ):
         vectors = results[tfidf_vectorizer.op.outputs["result"].name][0]
         features = results[tfidf_vectorizer.op.outputs["result"].name][1]
         self.assertTrue(isinstance(features, list))
         self.assertTrue(isinstance(vectors, np.ndarray))
         unique_tokens = list(
             set(input_sentence[0].lower().replace(".", "").split())
         )
         self.assertEqual(len(vectors[0]), len(unique_tokens))
         self.assertEqual(
             set(features).intersection(set(unique_tokens)), set(features)
         )
Example #11
 async def test_resize(self):
     async for ctx, results in MemoryOrchestrator.run(
             DataFlow.auto(resize, GetSingle),
         [
             Input(
                 value=[
                     resize.op.outputs["result"].name,
                 ],
                 definition=GetSingle.op.inputs["spec"],
             ),
             Input(
                 value=self.INPUT_ARRAY,
                 definition=resize.op.inputs["src"],
             ),
             Input(
                 value=[50, 50, 3],
                 definition=resize.op.inputs["dsize"],
             ),
         ],
     ):
         self.assertEqual(
             results[resize.op.outputs["result"].name].shape,
             (50, 50, 3),
         )
Example #12
 async def test_get_embedding(self):
     input_sentence = (
         "The end is the beginning , and the beginning is the end"
     )
     max_sentence_len = 15
     async for ctx, results in MemoryOrchestrator.run(
         DataFlow.auto(get_embedding, GetSingle),
         [
             Input(
                 value=[get_embedding.op.outputs["embedding"].name],
                 definition=GetSingle.op.inputs["spec"],
             ),
             Input(
                 value=input_sentence,
                 definition=get_embedding.op.inputs["text"],
             ),
             Input(
                 value="en_core_web_sm",
                 definition=get_embedding.op.inputs["spacy_model"],
             ),
             Input(
                 value=max_sentence_len,
                 definition=get_embedding.op.inputs["max_len"],
             ),
             Input(
                 value="<PAD>",
                 definition=get_embedding.op.inputs["pad_token"],
             ),
         ],
     ):
         embeddings = results[get_embedding.op.outputs["embedding"].name]
         self.assertEqual(max_sentence_len, len(embeddings))
         self.assertEqual(
             embeddings[randint(0, max_sentence_len - 1)].shape,
             embeddings[randint(0, max_sentence_len - 1)].shape,
         )
Example #13
File: df.py Project: jankeromnes/dffml
 async def records(self) -> AsyncIterator[Record]:
     async for record in self.sctx.records():
         async for ctx, result in MemoryOrchestrator.run(
                 self.parent.config.dataflow,
             [
                 Input(
                     value=record.feature(feature.name),
                     definition=Definition(name=feature.name,
                                           primitive=str(feature.dtype())),
                 ) for feature in self.parent.config.features
             ],
         ):
             if result:
                 record.evaluated(result)
             yield record
Example #14
    async def test_run(self):
        repos = [
            "http://pkg.freebsd.org/FreeBSD:13:amd64/latest/All/ImageMagick7-7.0.8.48.txz",
            "https://download.clearlinux.org/releases/10540/clear/x86_64/os/Packages/sudo-setuid-1.8.17p1-34.x86_64.rpm",
            "https://rpmfind.net/linux/fedora/linux/updates/29/Everything/x86_64/Packages/g/gzip-1.9-9.fc29.x86_64.rpm",
            "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/20/Everything/x86_64/os/Packages/c/curl-7.32.0-3.fc20.x86_64.rpm",
        ]

        dataflow = DataFlow.auto(
            URLToURLBytes,
            files_in_rpm,
            urlbytes_to_rpmfile,
            urlbytes_to_tarfile,
            is_binary_pie,
            Associate,
            cleanup_rpm,
        )
        async with MemoryOrchestrator.withconfig({}) as orchestrator:

            definitions = Operation.definitions(*OPERATIONS)

            async with orchestrator(dataflow) as octx:
                async for ctx, results in octx.run(
                    {
                        URL: [
                            Input(value=URL, definition=definitions["URL"]),
                            Input(
                                value=["rpm_filename", "binary_is_PIE"],
                                definition=definitions["associate_spec"],
                            ),
                        ]
                        for URL in repos
                    },
                    strict=True,
                ):
                    self.assertTrue(results)
Example #15
    async def setUp(self):
        dataflow = DataFlow(
            operations={
                "announce": announce.op,
                "get_single": GetSingle.imp.op,
            },
            seed=[
                Input(
                    value=[announce.op.outputs["string_out"].name],
                    definition=GetSingle.op.inputs["spec"],
                )
            ],
            implementations={announce.op.name: announce.imp},
        )

        self.dataflow = dataflow
Example #16
    async def test_condition_does_not_run(self):
        ran = []

        @op(conditions=[CONDITION])
        async def condition_test(hi: str):
            ran.append(True)

        async with MemoryOrchestrator() as orchestrator:
            async with orchestrator(DataFlow(condition_test)) as octx:
                async for _ in octx.run([
                        Input(
                            value=True,
                            definition=condition_test.op.inputs["hi"],
                        ),
                ]):
                    pass

        self.assertFalse(ran)
Example #17
async def run_dataflow(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
    inputs_created = {}
    definitions = self.config.dataflow.definitions

    for ctx_str, val_defs in inputs.items():
        inputs_created[ctx_str] = [
            Input(
                value=val_def["value"],
                definition=definitions[val_def["definition"]],
            ) for val_def in val_defs
        ]

    async with self.octx.parent(self.config.dataflow) as octx:
        results = [{
            (await ctx.handle()).as_string(): result
        } async for ctx, result in octx.run(inputs_created)]

    return {"results": results}
Example #18
 async def run(self):
     # Create an Orchestrator which will manage the running of our operations
     async with MemoryOrchestrator.withconfig({}) as orchestrator:
         # Create an orchestrator context; everything in DFFML follows this
         # one-two context entry pattern
         async with orchestrator(DATAFLOW) as octx:
             # Run all the operations. Each iteration of this loop happens
             # when all inputs are exhausted for a context; the output
             # operations are then run and their results are yielded
             async for package_name, results in octx.run({
                     # For each package, add a new input set to the input
                     # network. The context that operations execute under is
                     # the package name to evaluate. Contexts ensure that data
                     # pertaining to package A doesn't mingle with data
                     # pertaining to package B
                     package_name:
                 [
                     # The only input to the operations is the package name.
                     Input(
                         value=package_name,
                         definition=pypi_package_json.op.inputs["package"],
                     )
                 ]
                     for package_name in self.packages
             }):
                 # Grab the number of safety issues and the bandit report
                 # from the results dict
                 safety_issues = results[
                     safety_check.op.outputs["issues"].name]
                 bandit_report = results[
                     run_bandit.op.outputs["report"].name]
                 # Decide if those numbers mean we should stop ship or not
                 if (safety_issues > 0 or
                         bandit_report["CONFIDENCE.HIGH_AND_SEVERITY.HIGH"]
                         > 5):
                     print(f"Do not install {package_name}!")
                     for definition_name, result in results.items():
                         print(f"    {definition_name}: {result}")
                 else:
                     print(f"{package_name} is okay to install")
Example #19
async def run_dataflow(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
    """
        Starts a subflow `self.config.dataflow` and runs `inputs` in it.

        Parameters:
            inputs: Dict[str,Any] ->
                eg: {
                    "ctx_str" : [
                        {
                            "value":val1,
                            "defintion":defintion1
                        },
                        {
                            "value":val2,
                            "defintion":defintion2
                        }
                    ]
                }
        Returns:
            Dict[str,Any] -> maps context strings in inputs to output after running
                            through dataflow
    """
    inputs_created = {}
    definitions = self.config.dataflow.definitions

    for ctx_str, val_defs in inputs.items():
        inputs_created[ctx_str] = [
            Input(
                value=val_def["value"],
                definition=definitions[val_def["definition"]],
            ) for val_def in val_defs
        ]
    async with self.subflow(self.config.dataflow) as octx:
        results = [{
            (await ctx.handle()).as_string(): result
        } async for ctx, result in octx.run(inputs_created)]

    return {"results": results}
Example #20
File: dataflow.py Project: emrul/dffml
    async def run_default(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """
        The default implementation for the dataflow.run operation is the uctx
        mode. This mode is when we map unique strings to a list of inputs to be
        given to the respective string's context.
        """
        inputs_created = {}
        definitions = self.config.dataflow.definitions

        for ctx_str, val_defs in inputs.items():
            inputs_created[ctx_str] = [
                Input(
                    value=val_def["value"],
                    definition=definitions[val_def["definition"]],
                )
                for val_def in val_defs
            ]
        async with self.subflow(self.config.dataflow) as octx:
            results = [
                {(await ctx.handle()).as_string(): result}
                async for ctx, result in octx.run(inputs_created)
            ]

        return {"results": results}
Example #21
    pypi_package_url,
    pypi_package_contents,
    cleanup_pypi_package,
    safety_check,
    run_bandit,
    GetSingle,
)
# Seed inputs are added to each executing context. The following Input tells the
# GetSingle output operation that we want the output of the network to include
# data matching the "issues" output of the safety_check operation, and the
# "report" output of the run_bandit operation, for each context.
DATAFLOW.seed.append(
    Input(
        value=[
            safety_check.op.outputs["issues"].name,
            run_bandit.op.outputs["report"].name,
        ],
        definition=GetSingle.op.inputs["spec"],
    ))


class Install(CMD):

    arg_packages = Arg("packages",
                       nargs="+",
                       help="Package to check if we should install")

    async def run(self):
        # Create an Orchestrator which will manage the running of our operations
        async with MemoryOrchestrator.withconfig({}) as orchestrator:
            # Create an orchestrator context, everything in DFFML follows this
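The seed-plus-GetSingle pattern described in the comment above can be reduced to a minimal, self-contained sketch. The greet operation is a hypothetical stand-in, and the sketch assumes dffml's high-level API as used throughout these examples (@op derives input and output definitions from type hints, with the output named "result"):

    import asyncio
    from dffml import op, DataFlow, Input, GetSingle, MemoryOrchestrator

    @op
    async def greet(name: str) -> str:
        # Hypothetical operation used only for illustration
        return f"Hello {name}"

    # Seed every context with a GetSingle spec requesting greet's output
    dataflow = DataFlow.auto(greet, GetSingle)
    dataflow.seed.append(
        Input(
            value=[greet.op.outputs["result"].name],
            definition=GetSingle.op.inputs["spec"],
        ))

    async def main():
        async for ctx, results in MemoryOrchestrator.run(dataflow, [
                Input(value="World", definition=greet.op.inputs["name"]),
        ]):
            print(results)

    asyncio.run(main())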
Example #22
 async def test_run(self):
     calc_strings_check = {"add 40 and 2": 42, "multiply 42 and 10": 420}
     # TODO(p0) Implement and test asyncgenerator
     callstyles_no_expand = [
         "asyncgenerator",
         "dict",
         "dict_custom_input_set_context",
     ]
     callstyles = {
         "dict": {
             to_calc: [
                 Input(value=to_calc,
                       definition=parse_line.op.inputs["line"]),
                 Input(
                     value=[add.op.outputs["sum"].name],
                     definition=GetSingle.op.inputs["spec"],
                 ),
             ]
             for to_calc in calc_strings_check.keys()
         },
         "dict_custom_input_set_context": {
             CustomInputSetContext(to_calc): [
                 Input(value=to_calc,
                       definition=parse_line.op.inputs["line"]),
                 Input(
                     value=[add.op.outputs["sum"].name],
                     definition=GetSingle.op.inputs["spec"],
                 ),
             ]
             for to_calc in calc_strings_check.keys()
         },
         "list_input_sets": [
             MemoryInputSet(
                 MemoryInputSetConfig(
                     ctx=StringInputSetContext(to_calc),
                     inputs=[
                         Input(
                             value=to_calc,
                             definition=parse_line.op.inputs["line"],
                         ),
                         Input(
                             value=[add.op.outputs["sum"].name],
                             definition=GetSingle.op.inputs["spec"],
                         ),
                     ],
                 )) for to_calc in calc_strings_check.keys()
         ],
         "uctx": [[
             Input(value=to_calc, definition=parse_line.op.inputs["line"]),
             Input(
                 value=[add.op.outputs["sum"].name],
                 definition=GetSingle.op.inputs["spec"],
             ),
         ] for to_calc in calc_strings_check.keys()],
     }
     async with self.create_octx() as octx:
         for callstyle, inputs in callstyles.items():
             with self.subTest(callstyle=callstyle):
                 if callstyle in callstyles_no_expand:
                     run_coro = self.run_dataflow(octx, inputs)
                 else:
                     run_coro = self.run_dataflow(octx, *inputs)
                 async for ctx, results in run_coro:
                     ctx_str = (await ctx.handle()).as_string()
                     if callstyle == "uctx":
                         self.assertIn(
                             results[add.op.outputs["sum"].name],
                             dict(
                                 zip(
                                     calc_strings_check.values(),
                                     calc_strings_check.keys(),
                                 )),
                         )
                     else:
                         if callstyle == "dict_custom_input_set_context":
                             self.assertTrue(
                                 isinstance(ctx, CustomInputSetContext))
                         self.assertEqual(
                             calc_strings_check[ctx_str],
                             results[add.op.outputs["sum"].name],
                         )
Example #23
 async def test_run(self):
     calc_strings_check = {"add 40 and 2": 42, "multiply 42 and 10": 420}
     dataflow = DataFlow.auto(*OPIMPS)
     # TODO(p0) Implement and test asyncgenerator
     callstyles_no_expand = ["asyncgenerator", "dict"]
     callstyles = {
         "dict": {
             to_calc: [
                 Input(value=to_calc,
                       definition=parse_line.op.inputs["line"]),
                 Input(
                     value=[add.op.outputs["sum"].name],
                     definition=GetSingle.op.inputs["spec"],
                 ),
             ]
             for to_calc in calc_strings_check.keys()
         },
         "list_input_sets": [
             MemoryInputSet(
                 MemoryInputSetConfig(
                     ctx=StringInputSetContext(to_calc),
                     inputs=[
                         Input(
                             value=to_calc,
                             definition=parse_line.op.inputs["line"],
                         ),
                         Input(
                             value=[add.op.outputs["sum"].name],
                             definition=GetSingle.op.inputs["spec"],
                         ),
                     ],
                 )) for to_calc in calc_strings_check.keys()
         ],
         "uctx": [[
             Input(value=to_calc, definition=parse_line.op.inputs["line"]),
             Input(
                 value=[add.op.outputs["sum"].name],
                 definition=GetSingle.op.inputs["spec"],
             ),
         ] for to_calc in calc_strings_check.keys()],
     }
     async with MemoryOrchestrator.withconfig({}) as orchestrator:
         async with orchestrator(dataflow) as octx:
             for callstyle, inputs in callstyles.items():
                 with self.subTest(callstyle=callstyle):
                     if callstyle in callstyles_no_expand:
                         run_coro = octx.run(inputs)
                     else:
                         run_coro = octx.run(*inputs)
                     async for ctx, results in run_coro:
                         ctx_str = (await ctx.handle()).as_string()
                         if callstyle == "uctx":
                             self.assertIn(
                                 results[add.op.outputs["sum"].name],
                                 dict(
                                     zip(
                                         calc_strings_check.values(),
                                         calc_strings_check.keys(),
                                     )),
                             )
                         else:
                             self.assertEqual(
                                 calc_strings_check[ctx_str],
                                 results[add.op.outputs["sum"].name],
                             )
Example #24
    async def _multicomm_dataflow(self, config, request):
        # Seed the network with inputs given by caller
        # TODO(p0,security) allowlist of valid definitions to seed (set
        # Input.origin to something other than seed)
        inputs = []
        # If data was sent add those inputs
        if request.method == "POST":
            # Accept a list of input data according to config.input_mode
            if config.input_mode == "default":
                # TODO validate that input data is dict of list of inputs each item
                # has definition and value properties
                for ctx, client_inputs in (await request.json()).items():
                    for input_data in client_inputs:
                        if input_data["definition"] not in config.dataflow.definitions:
                            return web.json_response(
                                {
                                    "error":
                                    f"Missing definition for {input_data['definition']} in dataflow"
                                },
                                status=HTTPStatus.NOT_FOUND,
                            )

                    inputs.append(
                        MemoryInputSet(
                            MemoryInputSetConfig(
                                ctx=StringInputSetContext(ctx),
                                inputs=[
                                    Input(
                                        value=input_data["value"],
                                        definition=config.dataflow.definitions[
                                            input_data["definition"]],
                                    ) for input_data in client_inputs
                                ] + ([
                                    Input(
                                        value=request.headers,
                                        definition=config.dataflow.definitions[
                                            config.forward_headers],
                                    )
                                ] if config.forward_headers else []),
                            )))
            elif ":" in config.input_mode:
                preprocess_mode, *input_def = config.input_mode.split(":")
                input_def = ":".join(input_def)
                if input_def not in config.dataflow.definitions:
                    return web.json_response(
                        {
                            "error":
                            f"Missing definition for {input_def} in dataflow"
                        },
                        status=HTTPStatus.NOT_FOUND,
                    )

                if preprocess_mode == "json":
                    value = await request.json()
                elif preprocess_mode == "text":
                    value = await request.text()
                elif preprocess_mode == "bytes":
                    value = await request.read()
                elif preprocess_mode == "stream":
                    value = request.content
                else:
                    return web.json_response(
                        {
                            "error":
                            f"preprocess tag must be one of {self.IO_MODES}, got {preprocess_mode}"
                        },
                        status=HTTPStatus.NOT_FOUND,
                    )

                inputs.append(
                    MemoryInputSet(
                        MemoryInputSetConfig(
                            ctx=StringInputSetContext("post_input"),
                            inputs=[
                                Input(
                                    value=value,
                                    definition=config.dataflow.
                                    definitions[input_def],
                                )
                            ] + ([
                                Input(
                                    value=request.headers,
                                    definition=config.dataflow.definitions[
                                        config.forward_headers],
                                )
                            ] if config.forward_headers else []),
                        )))

            else:
                raise NotImplementedError(
                    "Input modes other than default,preprocess:definition_name  not yet implemented"
                )

        # Run the operation in an orchestrator
        # TODO(dfass) Create the orchestrator on startup of the HTTP API itself
        async with MemoryOrchestrator() as orchestrator:
            # TODO(dfass) Create octx on dataflow registration
            async with orchestrator(config.dataflow) as octx:
                results = {
                    str(ctx): result
                    async for ctx, result in octx.run(*inputs)
                }

                if config.output_mode == "json":
                    return web.json_response(results)

                # content_info is a List[str] ([content_type,output_keys])
                # in case of stream,bytes and string in others
                postprocess_mode, *content_info = config.output_mode.split(":")

                if postprocess_mode == "stream":
                    # stream:text/plain:get_single.beef
                    raise NotImplementedError(
                        "output mode  not yet implemented")

                elif postprocess_mode == "bytes":
                    content_type, output_keys = content_info
                    output_data = traverse_get(results, output_keys)
                    return web.Response(body=output_data)

                elif postprocess_mode == "text":
                    output_data = traverse_get(results, content_info[0])
                    return web.Response(text=output_data)

                else:
                    return web.json_response(
                        {"error": f"output mode not valid"},
                        status=HTTPStatus.NOT_FOUND,
                    )
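The input_mode and output_mode strings the handler above splits on ":" follow a preprocess:definition_name and postprocess:content_info convention. The values below are hypothetical, derived only from how the handler splits the strings; the definition name "message" and the output key path are assumptions:

    # Illustrative config values only, not taken from the project's docs
    input_mode = "text:message"   # preprocess_mode="text", input_def="message"
    output_mode = "json"          # return the results dict as JSON
    output_mode = "text:get_single.message"  # postprocess via traverse_get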
Example #25
HELLO_BLANK_DATAFLOW = DataFlow(
    operations={
        "hello_blank": formatter.op,
        "remap_to_response": remap.op
    },
    configs={
        "hello_blank": {
            "formatting": "Hello {}"
        },
        "remap_to_response": {
            "dataflow":
            DataFlow(
                operations={"get_formatted_message": GetSingle.op},
                seed=[
                    Input(
                        value=[formatter.op.outputs["string"].name],
                        definition=GetSingle.op.inputs["spec"],
                    )
                ],
            )
        },
    },
    seed=[
        Input(
            value={"response": [formatter.op.outputs["string"].name]},
            definition=remap.op.inputs["spec"],
        )
    ],
)

HELLO_WORLD_DATAFLOW = copy.deepcopy(HELLO_BLANK_DATAFLOW)
HELLO_WORLD_DATAFLOW.seed.append(
Example #26
                }],
            }),
        "predict_using_model":
        InputFlow(inputs={"features": [{
            "create_feature_map": "mapping"
        }]}),
        "print_predictions":
        InputFlow(inputs={"data": [{
            "predict_using_model": "prediction"
        }]}),
    },
)
dataflow.seed.append(
    Input(
        value="Years",
        definition=create_mapping.op.inputs["key"],
        origin="seed.Years",
    ))


async def main():
    # train the model
    await train(
        slr_model,
        {
            "Years": 0,
            "Salary": 10
        },
        {
            "Years": 1,
            "Salary": 20
Example #27
        "edit_feature": InputFlow(
            inputs={
                "features": [
                    {"seed": ["Years", "Expertise", "Trust", "Salary"]}
                ]
            },
        ),
        "associate_definition": InputFlow(inputs={"spec": ["seed"]}),
    },
)
TEST_DATAFLOW1.seed = [
    # I don't think we need this as we are providing the flow
    Input(
        value={
            feature.name: edit_feature.op.outputs["updated_features"].name
            for feature in TEST_FEATURE
        },
        definition=AssociateDefinition.op.inputs["spec"],
    )
]


class TestDataFlowSource(AsyncTestCase):
    @classmethod
    def setUpClass(self):
        self.records = [
            Record(
                str(i),
                data={
                    "features": {
                        "Years": A[i],
Example #28
    async def test_run(self):
        linker = Linker()
        exported = linker.export(*OPERATIONS)
        definitions, operations, _outputs = linker.resolve(exported)

        # Instantiate inputs
        repos = glob.glob(
            os.path.join(
                os.path.expanduser("~"),
                "Documents",
                "python",
                "testrepos",
                "*",
            )
        )
        if not repos:
            repos = glob.glob(
                os.path.join(
                    os.path.expanduser("~"), "Documents", "python", "dffml"
                )
            )
        if not repos:
            repos = [
                "https://github.com/intel/dffml",
                "https://github.com/pdxjohnny/dffml",
            ]
        repos = repos[:1]
        urls = [
            Input(value=URL, definition=definitions["URL"], parents=None)
            for URL in repos
        ]
        no_git_branch_given = Input(
            value=True,
            definition=definitions["no_git_branch_given"],
            parents=None,
        )
        date_spec = Input(
            value=datetime.now().strftime(TIME_FORMAT_MINTUE_RESOLUTION),
            definition=definitions["quarter_start_date"],
            parents=None,
        )
        quarters = [
            Input(value=i, definition=definitions["quarter"], parents=None)
            for i in range(0, 10)
        ]

        group_by_spec = Input(
            value={
                "cloc": {
                    "group": "quarter",
                    "by": "language_to_comment_ratio",
                    "fill": 0,
                },
                "authors": {
                    "group": "quarter",
                    "by": "author_count",
                    "fill": 0,
                },
                "work": {"group": "quarter", "by": "work_spread", "fill": 0},
                "release": {
                    "group": "quarter",
                    "by": "release_within_period",
                    "fill": False,
                },
                "commits": {
                    "group": "quarter",
                    "by": "commit_count",
                    "fill": 0,
                },
            },
            definition=definitions["group_by_spec"],
            parents=None,
        )

        # Orchestrate the running of these operations
        help(MemoryOrchestrator.basic_config)
        async with MemoryOrchestrator.basic_config(*OPIMPS) as orchestrator:
            async with orchestrator() as octx:
                # Add our inputs to the input network with the context being the URL
                for url in urls:
                    await octx.ictx.sadd(
                        url.value,
                        url,
                        no_git_branch_given,
                        date_spec,
                        group_by_spec,
                        *quarters,
                    )
                async for ctx, results in octx.run_operations():
                    self.assertTrue(results)
Example #29
async def run_dataflow(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
    """
    Starts a subflow ``self.config.dataflow`` and adds ``inputs`` in it.

    Parameters
    ----------
    inputs : dict
        The inputs to add to the subflow. These should be a key value mapping of
        the context string to the inputs which should be seeded for that context
        string.

    Returns
    -------
    dict
        Maps context strings in inputs to output after running through dataflow.

    Examples
    --------

    >>> URL = Definition(name="URL", primitive="string")
    >>>
    >>> subflow = DataFlow.auto(GetSingle)
    >>> subflow.definitions[URL.name] = URL
    >>> subflow.seed.append(
    ...     Input(
    ...         value=[URL.name],
    ...         definition=GetSingle.op.inputs["spec"]
    ...     )
    ... )
    >>>
    >>> dataflow = DataFlow.auto(run_dataflow, GetSingle)
    >>> dataflow.configs[run_dataflow.imp.op.name] = RunDataFlowConfig(subflow)
    >>> dataflow.seed.append(
    ...     Input(
    ...         value=[run_dataflow.imp.op.outputs["results"].name],
    ...         definition=GetSingle.op.inputs["spec"]
    ...     )
    ... )
    >>>
    >>> async def main():
    ...     async for ctx, results in MemoryOrchestrator.run(dataflow, {
    ...         "run_subflow": [
    ...             Input(
    ...                 value={
    ...                     "dffml": [
    ...                         {
    ...                             "value": "https://github.com/intel/dffml",
    ...                             "definition": URL.name
    ...                         }
    ...                     ]
    ...                 },
    ...                 definition=run_dataflow.imp.op.inputs["inputs"]
    ...             )
    ...         ]
    ...     }):
    ...         print(results)
    >>>
    >>> asyncio.run(main())
    {'flow_results': {'dffml': {'URL': 'https://github.com/intel/dffml'}}}
    """
    inputs_created = {}
    definitions = self.config.dataflow.definitions

    for ctx_str, val_defs in inputs.items():
        inputs_created[ctx_str] = [
            Input(
                value=val_def["value"],
                definition=definitions[val_def["definition"]],
            ) for val_def in val_defs
        ]
    async with self.subflow(self.config.dataflow) as octx:
        results = [{
            (await ctx.handle()).as_string(): result
        } async for ctx, result in octx.run(inputs_created)]

    return {"results": results}
Example #30
    async def test_run(self):
        test_dataflow = DataFlow(
            operations={
                "run_dataflow": run_dataflow.op,
                "get_single": GetSingle.imp.op,
            },
            configs={"run_dataflow": RunDataFlowConfig(dataflow=DATAFLOW)},
            seed=[
                Input(
                    value=[run_dataflow.op.outputs["results"].name],
                    definition=GetSingle.op.inputs["spec"],
                )
            ],
        )

        test_inputs = [
            {
                "add_op": [
                    {
                        "value": "add 40 and 2",
                        "definition": parse_line.op.inputs["line"].name,
                    },
                    {
                        "value": [add.op.outputs["sum"].name],
                        "definition": GetSingle.op.inputs["spec"].name,
                    },
                ]
            },
            {
                "mult_op": [
                    {
                        "value": "multiply 42 and 10",
                        "definition": parse_line.op.inputs["line"].name,
                    },
                    {
                        "value": [mult.op.outputs["product"].name],
                        "definition": GetSingle.op.inputs["spec"].name,
                    },
                ]
            },
        ]
        test_outputs = {"add_op": 42, "mult_op": 420}

        async with MemoryOrchestrator.withconfig({}) as orchestrator:
            async with orchestrator(test_dataflow) as octx:
                async for _ctx, results in octx.run({
                        list(test_input.keys())[0]: [
                            Input(
                                value=test_input,
                                definition=run_dataflow.op.inputs["inputs"],
                            )
                        ]
                        for test_input in test_inputs
                }):
                    ctx_str = (await _ctx.handle()).as_string()
                    self.assertIn("flow_results", results)

                    results = results["flow_results"]
                    self.assertIn(ctx_str, map(str, results.keys()))
                    self.assertIn(ctx_str, test_outputs)

                    results = results[list(results.keys())[0]]
                    self.assertIn("result", results)

                    results = results["result"]
                    expected_results = test_outputs[ctx_str]
                    self.assertEqual(expected_results, results)