async def test_get_embedding(self):
    """
    Run ``get_embedding`` on a sentence and check that one embedding is
    returned per whitespace-separated word and that all embeddings have the
    same shape.
    """
    input_sentence = "The end is the beginning , and the beginning is the end"
    words = input_sentence.split()
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(get_embedding, GetSingle),
        [
            Input(
                value=[get_embedding.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_sentence,
                definition=get_embedding.op.inputs["text"],
            ),
            Input(
                value="en_core_web_sm",
                definition=get_embedding.op.inputs["spacy_model"],
            ),
        ],
    ):
        embeddings = results[get_embedding.op.outputs["result"].name]
        self.assertEqual(len(words), len(embeddings))
        # Compare every embedding against the first one. Comparing two
        # randomly chosen indices would make the test non-deterministic and
        # assert nothing whenever both indices coincide.
        expected_shape = embeddings[0].shape
        for embedding in embeddings:
            self.assertEqual(embedding.shape, expected_shape)
async def test_convert_color(self):
    """
    Convert the fixture array with code ``BGR2RGB`` and check that converting
    the result back with ``cv2.COLOR_RGB2BGR`` gives the fixture's values.
    """
    dataflow = DataFlow.auto(convert_color, GetSingle)
    seed_inputs = [
        Input(
            value=[convert_color.op.outputs["result"].name],
            definition=GetSingle.op.inputs["spec"],
        ),
        Input(
            value=self.INPUT_ARRAY,
            definition=convert_color.op.inputs["src"],
        ),
        Input(
            value="BGR2RGB",
            definition=convert_color.op.inputs["code"],
        ),
    ]
    async for _ctx, results in MemoryOrchestrator.run(dataflow, seed_inputs):
        converted = results[convert_color.op.outputs["result"].name]
        # Undo the conversion so the pixels can be compared to the input
        round_tripped = cv2.cvtColor(converted, cv2.COLOR_RGB2BGR)
        self.assertEqual(
            round_tripped.flatten().tolist(),
            self.INPUT_ARRAY.flatten().tolist(),
        )
async def test_calcHist(self):
    """
    Run ``calcHist`` over channels 0 and 1 of the fixture array with 32 bins
    each and check that the resulting histogram has shape ``(32, 32)``.
    """
    dataflow = DataFlow.auto(calcHist, GetSingle)
    op_inputs = calcHist.op.inputs
    seed_inputs = [
        Input(
            value=[calcHist.op.outputs["result"].name],
            definition=GetSingle.op.inputs["spec"],
        ),
        Input(value=self.INPUT_ARRAY, definition=op_inputs["images"]),
        Input(value=None, definition=op_inputs["mask"]),
        Input(value=[0, 1], definition=op_inputs["channels"]),
        Input(value=[32, 32], definition=op_inputs["histSize"]),
        Input(value=[0, 256, 0, 256], definition=op_inputs["ranges"]),
    ]
    async for _ctx, results in MemoryOrchestrator.run(dataflow, seed_inputs):
        histogram = results[calcHist.op.outputs["result"].name]
        self.assertEqual(histogram.shape, (32, 32))
async def test_0_create(self):
    """
    Run ``db_query_create_table`` for ``self.table_name`` and check through
    ``sqlite_master`` that exactly one table with that name exists.
    """
    df = self._create_dataflow_with_op(db_query_create_table)
    test_inputs = {
        "create": {"table_name": self.table_name, "cols": self.cols}
    }
    op_inputs = db_query_create_table.op.inputs

    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(df) as octx:
            # One context per entry of test_inputs, each holding one Input
            # per operation input name
            inputs_by_ctx = {
                ctx_name: [
                    Input(value=value, definition=op_inputs[input_name])
                    for input_name, value in values.items()
                ]
                for ctx_name, values in test_inputs.items()
            }
            async for _ctx, _results in octx.run(inputs_by_ctx):
                pass

        async with self.sdb as db:
            async with db() as db_ctx:
                query = (
                    "SELECT count(name) FROM sqlite_master "
                    f" WHERE type='table' and name='{self.table_name}' "
                )
                cursor = db_ctx.parent.cursor
                cursor.execute(query)
                row = cursor.fetchone()
                self.assertEqual(row["count(name)"], 1)
async def test_2_lookup(self):
    """
    Run ``db_query_lookup`` with no column or condition filters and check
    that the rows returned equal ``self.data_dicts``.
    """
    spec_input = Input(
        value=[db_query_lookup.op.outputs["lookups"].name],
        definition=GetSingle.op.inputs["spec"],
    )
    df = self._create_dataflow_with_op(db_query_lookup, seed=[spec_input])
    test_inputs = {
        "lookup": {
            "table_name": self.table_name,
            "cols": [],
            "conditions": [],
        }
    }
    op_inputs = db_query_lookup.op.inputs

    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(df) as octx:
            inputs_by_ctx = {
                ctx_name: [
                    Input(value=value, definition=op_inputs[input_name])
                    for input_name, value in values.items()
                ]
                for ctx_name, values in test_inputs.items()
            }
            async for _ctx, results in octx.run(inputs_by_ctx):
                self.assertIn("query_lookups", results)
                rows = results["query_lookups"]
                self.assertEqual(self.data_dicts, rows)
async def test_1_insert(self):
    """
    Insert every entry of ``self.data_dicts`` with ``db_query_insert``, one
    orchestrator run per entry, then check that selecting all rows of the
    table returns those same entries.
    """
    df = self._create_dataflow_with_op(db_query_insert)
    op_inputs = db_query_insert.op.inputs

    for row_data in self.data_dicts:
        test_inputs = {
            "insert": {"table_name": self.table_name, "data": row_data}
        }
        async with MemoryOrchestrator.withconfig({}) as orchestrator:
            async with orchestrator(df) as octx:
                inputs_by_ctx = {
                    ctx_name: [
                        Input(value=value, definition=op_inputs[input_name])
                        for input_name, value in values.items()
                    ]
                    for ctx_name, values in test_inputs.items()
                }
                # Results are not inspected, the run only has to complete
                async for _ctx, _results in octx.run(inputs_by_ctx):
                    pass

    # Verify only once every row has been inserted
    async with self.sdb as db:
        async with db() as db_ctx:
            cursor = db_ctx.parent.cursor
            cursor.execute(f"SELECT * FROM {self.table_name}")
            rows = cursor.fetchall()
            self.assertEqual(self.data_dicts, [dict(row) for row in rows])
async def test_associatedefinition(self):
    """
    Seed two "output" inputs, each with a parent of a different definition,
    and check that ``AssociateDefinition`` maps each parent definition name
    to the value of the "output" input descended from it.
    """
    feed_def = Definition(name="feed", primitive="string")
    dead_def = Definition(name="dead", primitive="string")
    output = Definition(name="output", primitive="string")

    feed_input = Input(value="my favorite value", definition=feed_def)
    face_input = Input(value="face", definition=output, parents=[feed_input])

    dead_input = Input(value="my second favorite value", definition=dead_def)
    beef_input = Input(value="beef", definition=output, parents=[dead_input])

    test_result = {"feed": "face", "dead": "beef"}
    for parent_name, expected_value in test_result.items():
        spec_input = Input(
            value={parent_name: "output"},
            definition=AssociateDefinition.op.inputs["spec"],
        )
        async for _ctx, results in MemoryOrchestrator.run(
            DataFlow.auto(AssociateDefinition),
            [feed_input, face_input, dead_input, beef_input, spec_input],
        ):
            self.assertEqual(results, {parent_name: expected_value})
async def test_vaildation_by_op(self):
    """
    Build a dataflow from ``validate_shouts``, ``echo_shout`` and
    ``GetSingle`` and check that a ``SHOUTIN`` input comes out of
    ``shout_out`` with ``_validated`` appended.
    """
    spec_input = Input(
        value=[echo_shout.op.outputs["shout_out"].name],
        definition=GetSingle.op.inputs["spec"],
    )
    operations = {
        "validate_shout_instance": validate_shouts.op,
        "echo_shout": echo_shout.op,
        "get_single": GetSingle.imp.op,
    }
    implementations = {
        validate_shouts.op.name: validate_shouts.imp,
        echo_shout.op.name: echo_shout.imp,
    }
    test_dataflow = DataFlow(
        operations=operations,
        seed=[spec_input],
        implementations=implementations,
    )
    shout = Input(value="validation_status:", definition=SHOUTIN)
    test_inputs = {"TestShoutOut": [shout]}

    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(test_dataflow) as octx:
            async for _ctx_str, results in octx.run(test_inputs):
                self.assertIn("shout_out", results)
                self.assertEqual(
                    results["shout_out"], "validation_status:_validated"
                )
async def test_0_start_container(self):
    """
    Run ``self.dataflow`` with ``clone_git_repo.imp.CONTEXT`` patched to
    ``FakeCloneRepoImp`` and check that the ``docker ps`` listing for the
    image changes and that ``docker_restarted_containers`` is reported.
    """
    tag = f"{USER}/{REPO}"

    async def list_containers():
        # IDs and uptimes of containers whose ancestor image is ``tag``
        return await check_output(
            "docker",
            "ps",
            "--filter",
            f"ancestor={tag}",
            "--format",
            "{{.ID}} {{.RunningFor}}",
        )

    with mock.patch.object(
        clone_git_repo.imp, "CONTEXT", new=FakeCloneRepoImp
    ):
        before = await list_containers()
        async with MemoryOrchestrator.withconfig({}) as orchestrator:
            async with orchestrator(self.dataflow) as octx:
                async for _ctx, results in octx.run(self.test_inputs):
                    after = await list_containers()
                    self.assertNotEqual(before, after)
                    self.assertIn("docker_restarted_containers", results)
                    # Stored on the test case; presumably cleaned up by a
                    # later step - verify against the rest of the class
                    self.containers_to_remove = results[
                        "docker_restarted_containers"
                    ]
async def test_pos_tagger(self):
    """
    Run ``pos_tagger`` on a sentence and check that each returned pair holds
    the matching word and one of the expected part-of-speech tags.
    """
    input_sentence = "The end is the beginning , and the beginning is the end"
    expected_tags = ["DT", "NN", "VBZ", "CC", ","]
    dataflow = DataFlow.auto(pos_tagger, GetSingle)
    seed_inputs = [
        Input(
            value=[pos_tagger.op.outputs["result"].name],
            definition=GetSingle.op.inputs["spec"],
        ),
        Input(
            value=input_sentence,
            definition=pos_tagger.op.inputs["text"],
        ),
        Input(
            value="en_core_web_sm",
            definition=pos_tagger.op.inputs["spacy_model"],
        ),
    ]
    async for _ctx, results in MemoryOrchestrator.run(dataflow, seed_inputs):
        pos_tags = results[pos_tagger.op.outputs["result"].name]
        # Index by position so a result shorter than the sentence fails
        for index, word in enumerate(input_sentence.split()):
            tagged_word, tag = pos_tags[index][0], pos_tags[index][1]
            self.assertEqual(tagged_word, word)
            self.assertIn(tag, expected_tags)
async def test_run(self):
    """
    Feed the bytes of ``input.mp4`` to ``convert_to_gif`` with a resolution
    of 240 and check that the output is larger than 100000 in length.
    """
    spec_input = Input(
        value=[convert_to_gif.op.outputs["output_file"].name],
        definition=GetSingle.op.inputs["spec"],
    )
    dataflow = DataFlow.auto(convert_to_gif, GetSingle)
    dataflow.seed.append(spec_input)

    input_file_path = self.parent_path / "input.mp4"
    with open(input_file_path, "rb") as video:
        input_file = video.read()

    op_inputs = convert_to_gif.op.inputs
    test_inputs = {
        "Test": [
            Input(value=input_file, definition=op_inputs["input_file"]),
            Input(value=240, definition=op_inputs["resolution"]),
        ]
    }

    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(dataflow) as octx:
            async for _ctx, results in octx.run(test_inputs):
                self.assertIn("output_file", results)
                gif_bytes = results["output_file"]
                self.assertGreater(len(gif_bytes), 100000)
async def test_get_similarity(self):
    """
    Run ``get_similarity`` on two sentences that differ only in verb tense
    and check that the score is greater than 0.9.
    """
    input_sentence1 = "The end is the beginning , and the beginning is the end"
    input_sentence2 = (
        "The end was the beginning , and the beginning was the end"
    )
    dataflow = DataFlow.auto(get_similarity, GetSingle)
    op_inputs = get_similarity.op.inputs
    seed_inputs = [
        Input(
            value=[get_similarity.op.outputs["result"].name],
            definition=GetSingle.op.inputs["spec"],
        ),
        Input(value=input_sentence1, definition=op_inputs["text_1"]),
        Input(value=input_sentence2, definition=op_inputs["text_2"]),
        Input(value="en_core_web_sm", definition=op_inputs["spacy_model"]),
    ]
    async for _ctx, results in MemoryOrchestrator.run(dataflow, seed_inputs):
        similarity_score = results[get_similarity.op.outputs["result"].name]
        self.assertGreater(similarity_score, 0.9)
async def test_run(self):
    """
    Run the dataflow built from ``OPIMPS`` over 20 random password strings,
    one context per password, and check that every context yields a truthy
    result.
    """
    dataflow = DataFlow.auto(*OPIMPS)
    raw_passwords = [str(random.random()) for _ in range(20)]

    # Orchestrate the running of these operations
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        definitions = Operation.definitions(*OPERATIONS)

        passwords = [
            Input(
                value=raw_password,
                definition=definitions["UnhashedPassword"],
                parents=None,
            )
            for raw_password in raw_passwords
        ]

        output_spec = Input(
            value=["ScryptPassword"],
            definition=definitions["get_single_spec"],
            parents=None,
        )

        async with orchestrator(dataflow) as octx:
            # Each password value doubles as the name of its context. Any
            # exception raised while running propagates and fails the test.
            async for _ctx, results in octx.run(
                {
                    password.value: [password, output_spec]
                    for password in passwords
                }
            ):
                self.assertTrue(results)
async def test_run(self):
    """
    Add one context per calculation string, run all operations and check
    that the ``sum`` selected by ``GetSingle`` equals the expected number.
    """
    calc_strings_check = {"add 40 and 2": 42, "multiply 42 and 10": 420}
    sum_name = calc_add.op.outputs["sum"].name

    async with MemoryOrchestrator.basic_config(*OPIMPS) as orchestrator:
        async with orchestrator() as octx:
            # The calculation string is both the context name and the line
            for to_calc in calc_strings_check:
                line_input = Input(
                    value=to_calc,
                    definition=calc_parse_line.op.inputs["line"],
                )
                spec_input = Input(
                    value=[sum_name],
                    definition=GetSingle.op.inputs["spec"],
                )
                await octx.ictx.sadd(to_calc, line_input, spec_input)

            async for ctx, results in octx.run_operations():
                handle = await ctx.handle()
                expected = calc_strings_check[handle.as_string()]
                self.assertEqual(
                    expected, results[GetSingle.op.name][sum_name]
                )
async def test_principal_component_analysis(self):
    """
    Run ``principal_component_analysis`` with ``n_components=8`` on a
    generated 10-sample, 10-feature data set and check that the result has
    shape ``(10, 8)``.
    """
    input_data, _ = make_classification(
        n_samples=10,
        n_features=10,
        n_informative=8,
        n_redundant=2,
        random_state=7,
    )
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(principal_component_analysis, GetSingle),
        [
            Input(
                value=[principal_component_analysis.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_data,
                definition=principal_component_analysis.op.inputs["data"],
            ),
            Input(
                value=8,
                definition=principal_component_analysis.op.inputs[
                    "n_components"
                ],
            ),
        ],
    ):
        # assertEqual reports both shapes on failure, unlike
        # assertTrue(a == b) which only reports "False is not true"
        self.assertEqual(
            (10, 8),
            results[principal_component_analysis.op.outputs["result"].name].shape,
        )
async def operation_db():
    """
    Create the database (examples.db) and table (myTable) used by the db
    operations.
    """
    database = SqliteDatabase(SqliteDatabaseConfig(filename="examples.db"))

    create_table_flow = DataFlow(
        operations={"db_query_create": db_query_create_table.op},
        configs={"db_query_create": DatabaseQueryConfig(database=database)},
        seed=[],
    )

    table_name = Input(
        value="myTable",
        definition=db_query_create_table.op.inputs["table_name"],
    )
    columns = Input(
        value={
            "key": "INTEGER NOT NULL PRIMARY KEY",
            "firstName": "text",
            "lastName": "text",
            "age": "int",
        },
        definition=db_query_create_table.op.inputs["cols"],
    )

    # Drain the run, its results are not used
    async for _ctx, _result in MemoryOrchestrator.run(
        create_table_flow, [table_name, columns]
    ):
        pass
async def test_auto_start(self):
    """
    Run ``self.dataflow`` under a context with no inputs and check that the
    ``string`` result is ``"EXISTS"``.
    """
    # A single context with nothing seeded into it
    empty_context = {"testStart": []}
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(self.dataflow) as octx:
            async for _ctx_str, results in octx.run(empty_context):
                self.assertIn("string", results)
                self.assertEqual("EXISTS", results["string"])
async def test_run(self):
    """
    Add 20 random password strings to the input network, one context per
    password, run the operations in strict mode and check every result is
    truthy. Returns without failing when ``hashlib`` has no ``scrypt``.
    """
    raw_passwords = [str(random.random()) for _ in range(0, 20)]

    # Orchestrate the running of these operations
    async with MemoryOrchestrator.basic_config(*OPIMPS) as orchestrator:
        definitions = Operation.definitions(*OPERATIONS)

        password_inputs = [
            Input(
                value=raw_password,
                definition=definitions["UnhashedPassword"],
                parents=None,
            )
            for raw_password in raw_passwords
        ]
        output_spec = Input(
            value=["ScryptPassword"],
            definition=definitions["get_single_spec"],
            parents=None,
        )

        async with orchestrator() as octx:
            # Add each password under a context named after its value
            for password_input in password_inputs:
                input_set_config = MemoryInputSetConfig(
                    ctx=StringInputSetContext(password_input.value),
                    inputs=[password_input, output_spec],
                )
                await octx.ictx.add(MemoryInputSet(input_set_config))

            try:
                async for _ctx, results in octx.run_operations(strict=True):
                    self.assertTrue(results)
            except AttributeError as error:
                # Only the missing hashlib.scrypt error is tolerated
                if "module 'hashlib' has no attribute 'scrypt'" not in str(
                    error
                ):
                    raise
                return
async def test_simple_imputer(self):
    """
    Run ``simple_imputer`` with the ``mean`` strategy on data containing
    NaN values and check that the result equals the expected filled data.
    """
    input_data = [[np.nan, 2], [6, np.nan], [7, 6]]
    output_data = [[6.5, 2], [6, 4], [7, 6]]
    dataflow = DataFlow.auto(simple_imputer, GetSingle)
    op_inputs = simple_imputer.op.inputs
    seed_inputs = [
        Input(
            value=[simple_imputer.op.outputs["result"].name],
            definition=GetSingle.op.inputs["spec"],
        ),
        Input(value=input_data, definition=op_inputs["data"]),
        Input(value=np.nan, definition=op_inputs["missing_values"]),
        Input(value="mean", definition=op_inputs["strategy"]),
    ]
    async for _ctx, results in MemoryOrchestrator.run(dataflow, seed_inputs):
        imputed = results[simple_imputer.op.outputs["result"].name]
        # Element-wise comparison, every element has to match
        matches = imputed == output_data
        self.assertTrue(matches.all())
class DataFlowSourceConfig:
    """
    Configuration for a source which wraps another source and uses a
    dataflow to preprocess its records (per the field descriptions below).
    """

    source: BaseSource = field("Source to wrap")
    dataflow: DataFlow = field("DataFlow to use for preprocessing")
    features: Features = field(
        "Features to pass as definitions to each context from each "
        "record to be preprocessed",
        default=Features(),
    )
    inputs: List[str] = field(
        "Other inputs to add under each ctx (record's key will "
        "be used as the context)",
        action=ParseInputsAction,
        default_factory=list,
    )
    # The first literal ends with a space so the two sentences do not run
    # together in the rendered help text
    record_def: str = field(
        "Definition to be used for record.key. "
        "If set, record.key will be added to the set of inputs "
        "under each context (which is also the record's key)",
        default=None,
    )
    length: str = field(
        "Definition name to add as source length", default=None
    )
    all_for_single: bool = field(
        "Run all records through dataflow before grabbing "
        "results of desired record on a call to record()",
        default=False,
    )
    no_strict: bool = field(
        "Do not exit on operation exceptions, just log errors",
        default=False,
    )
    # NOTE(review): this default is created once, when the class body is
    # evaluated, so it looks like it is shared by every config that does not
    # override it - confirm that sharing is intended
    orchestrator: BaseOrchestrator = MemoryOrchestrator.withconfig({})
async def test_AcceptUserInput(self):
    """
    Run ``self.InputDataflow`` with ``builtins.input`` patched to return a
    fixed string and check that the string comes back as ``UserInput``.
    """
    typed_text = "Testing AcceptUserInput"
    test_inputs = {"testInput": []}
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(self.InputDataflow) as octx:
            # Stand in for the user typing at the prompt
            with mock.patch("builtins.input", return_value=typed_text):
                async for _ctx_str, results in octx.run(test_inputs):
                    self.assertIn("UserInput", results)
                    self.assertEqual(typed_text, results["UserInput"])
async def multicomm_dataflow(self, config, request):
    """
    Run ``config.dataflow`` and respond with its results as JSON.

    For a POST request the JSON body is read as a mapping of context name to
    a list of items, each with ``definition`` (the name of a definition in
    the dataflow) and ``value`` properties. Each context becomes one input
    set. For any other method the dataflow is run without client inputs.

    Responds with 400 when the body does not have that structure, with 404
    when an item names a definition the dataflow does not have, and
    otherwise with a mapping of ``str(ctx)`` to that context's result.
    """
    # Seed the network with inputs given by caller
    # TODO(p0,security) allowlist of valid definitions to seed (set
    # Input.origin to something other than seed)
    inputs = []
    # If data was sent add those inputs
    if request.method == "POST":
        payload = await request.json()
        # The body comes from the client, so check its structure before
        # indexing into it rather than failing with an unhandled exception
        if not isinstance(payload, dict):
            return web.json_response(
                {"error": "Request body must be an object of context to inputs"},
                status=HTTPStatus.BAD_REQUEST,
            )
        for ctx, client_inputs in payload.items():
            if not isinstance(client_inputs, list):
                return web.json_response(
                    {"error": f"Inputs for context {ctx} must be a list"},
                    status=HTTPStatus.BAD_REQUEST,
                )
            for input_data in client_inputs:
                if (
                    not isinstance(input_data, dict)
                    or "definition" not in input_data
                    or "value" not in input_data
                ):
                    return web.json_response(
                        {
                            "error": f"Each input for context {ctx} must have definition and value properties"
                        },
                        status=HTTPStatus.BAD_REQUEST,
                    )
                if input_data["definition"] not in config.dataflow.definitions:
                    return web.json_response(
                        {
                            "error": f"Missing definition for {input_data['definition']} in dataflow"
                        },
                        status=HTTPStatus.NOT_FOUND,
                    )
            # Every item of this context was validated above
            inputs.append(
                MemoryInputSet(
                    MemoryInputSetConfig(
                        ctx=StringInputSetContext(ctx),
                        inputs=[
                            Input(
                                value=input_data["value"],
                                definition=config.dataflow.definitions[
                                    input_data["definition"]
                                ],
                            )
                            for input_data in client_inputs
                        ],
                    )
                )
            )
    # Run the operation in an orchestrator
    # TODO(dfass) Create the orchestrator on startup of the HTTP API itself
    async with MemoryOrchestrator.basic_config() as orchestrator:
        # TODO(dfass) Create octx on dataflow registration
        async with orchestrator(config.dataflow) as octx:
            results = {
                str(ctx): result async for ctx, result in octx.run(*inputs)
            }
    # TODO Implement input and presentation stages? Sketch:
    #   config.presentation == "blob" -> web.Response(body=results)
    #   config.presentation == "text" -> web.Response(text=results)
    #   otherwise                     -> the JSON response below
    return web.json_response(results)
async def test_condition_does_not_run_auto_start(self):
    """
    Check that an operation which takes no inputs but has ``CONDITION`` as a
    condition is not run when no inputs are given.
    """
    ran = []

    @op(conditions=[CONDITION])
    async def condition_test():
        ran.append(True)  # pragma: no cover

    dataflow = DataFlow(condition_test)
    async with MemoryOrchestrator() as orchestrator:
        async with orchestrator(dataflow) as octx:
            # Drain the run, only the side effect on ``ran`` matters
            async for _result in octx.run([]):
                pass

    self.assertFalse(ran)
async def test_print_output(self):
    """
    Run ``self.OutputDataflow`` with a ``DataToPrint`` input while stdout is
    redirected to ``self.stdout`` and check that the text was written there.
    """
    message = Input(
        value="Testing print_output",
        definition=self.OutputDataflow.definitions["DataToPrint"],
        parents=None,
    )
    test_inputs = [message]
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(self.OutputDataflow) as octx:
            with contextlib.redirect_stdout(self.stdout):
                async for _ctx_str, _results in octx.run(test_inputs):
                    captured = self.stdout.getvalue()
                    self.assertIn("Testing print_output", captured)
async def run(self):
    """
    Evaluate every package in ``self.packages`` and print whether it is okay
    to install.

    A package is reported as "Do not install" when the ``safety_check``
    issue count is greater than zero, or when the ``run_bandit`` report has
    more than 5 entries under ``CONFIDENCE.HIGH_AND_SEVERITY.HIGH``.
    """
    # Names of the definitions selected by GetSingle. Its results are looked
    # up by these names rather than by position, so the check does not
    # depend on the order of the results dict.
    issues_name = safety_check.op.outputs["issues"].name
    report_name = run_bandit.op.outputs["report"].name

    # Create an Orchestrator which will manage the running of our operations
    async with MemoryOrchestrator.basic_config(*OPIMPS) as orchestrator:
        # Create a orchestrator context, everything in DFFML follows this
        # one-two context entry pattern
        async with orchestrator() as octx:
            for package_name in self.packages:
                # For each package add a new input set to the network of
                # inputs (ictx). Operations run under a context, the context
                # here is the package_name to evaluate (the first argument).
                # The next arguments are all the inputs we're seeding the
                # network with for that context. We give the package name
                # because pypi_latest_package_version needs it to find the
                # version, which safety will then use. We also give an input
                # to the output operation GetSingle, which takes a list of
                # data type definitions we want to select as our results.
                await octx.ictx.sadd(
                    package_name,
                    Input(
                        value=package_name,
                        definition=pypi_package_json.op.inputs["package"],
                    ),
                    Input(
                        value=[issues_name, report_name],
                        definition=GetSingle.op.inputs["spec"],
                    ),
                )

            # Run all the operations, Each iteration of this loop happens
            # when all inputs are exhausted for a context, the output
            # operations are then run and their results are yielded
            async for ctx, results in octx.run_operations():
                # The context for this data flow was the package name
                package_name = (await ctx.handle()).as_string()
                # Get the results of the GetSingle output operation
                results = results[GetSingle.op.name]
                safety_issues = results[issues_name]
                bandit_report = results[report_name]
                if (
                    safety_issues > 0
                    or bandit_report["CONFIDENCE.HIGH_AND_SEVERITY.HIGH"] > 5
                ):
                    print(f"Do not install {package_name}! {results!r}")
                else:
                    print(f"{package_name} is okay to install")
async def records(self) -> AsyncIterator[Record]:
    """
    Yield the records of ``self.sctx`` after running the configured dataflow
    on the values of the configured features of each record.

    A record is marked as evaluated with the dataflow's result whenever that
    result is truthy.
    """
    async for record in self.sctx.records():
        config = self.parent.config
        # One Input per configured feature, with a definition built from
        # the feature's name and the string form of its dtype
        feature_inputs = []
        for feature in config.features:
            definition = Definition(
                name=feature.name, primitive=str(feature.dtype())
            )
            feature_inputs.append(
                Input(
                    value=record.feature(feature.name),
                    definition=definition,
                )
            )
        async for _ctx, result in MemoryOrchestrator.run(
            config.dataflow, feature_inputs
        ):
            if result:
                record.evaluated(result)
            yield record
async def test_validate(self):
    """
    Run ``self.dataflow`` with a shape name, radius and pie value and check
    the ``mapping`` result: upper-cased name, area of 3.14 and radius of 1.
    """
    shape_inputs = [
        Input(value="unitcircle", definition=ShapeName),
        Input(value=1, definition=Radius),
        Input(value=3.14, definition=Pie),
    ]
    test_inputs = {"area": shape_inputs}
    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(self.dataflow) as octx:
            async for _ctx_str, results in octx.run(test_inputs):
                self.assertIn("mapping", results)
                mapping = results["mapping"]
                self.assertEqual(mapping["name"], "UNITCIRCLE")
                self.assertEqual(mapping["area"], 3.14)
                self.assertEqual(mapping["radius"], 1)
async def test_gen_with_input(self):
    """
    Run ``counter`` and ``echo_num`` with a ``CountStart`` of 1 and check
    that the numbers collected by ``GetMulti`` are 1 through 5.
    """
    spec_input = Input(
        value=[echo_num.op.outputs["number_out"].name],
        definition=GetMulti.op.inputs["spec"],
    )
    test_dataflow = DataFlow.auto(GetMulti, counter, echo_num)
    test_dataflow.seed.append(spec_input)
    for operation in (counter, echo_num):
        test_dataflow.implementations[operation.op.name] = operation.imp

    start = Input(value=1, definition=CountStart)
    test_inputs = {"TestCount": [start]}

    async with MemoryOrchestrator.withconfig({}) as orchestrator:
        async with orchestrator(test_dataflow) as octx:
            async for _ctx_str, results in octx.run(test_inputs):
                self.assertIn("number", results)
                # Compared as sets, order of the numbers is not checked
                self.assertEqual({1, 2, 3, 4, 5}, set(results["number"]))
async def test_condition_does_not_run(self):
    """
    Check that an operation with ``CONDITION`` as a condition is not run
    when only its ``hi`` input is given.
    """
    ran = []

    @op(conditions=[CONDITION])
    async def condition_test(hi: str):
        ran.append(True)

    dataflow = DataFlow(condition_test)
    hi_input = Input(value=True, definition=condition_test.op.inputs["hi"])

    async with MemoryOrchestrator() as orchestrator:
        async with orchestrator(dataflow) as octx:
            # Drain the run, only the side effect on ``ran`` matters
            async for _result in octx.run([hi_input]):
                pass

    self.assertFalse(ran)
async def test_standard_scaler(self):
    """
    Run ``standard_scaler`` on a small two-column data set and check that
    the result equals the expected scaled values.
    """
    input_data = [[0, 0], [0, 0], [1, 1], [1, 1]]
    output_data = [[-1, -1], [-1, -1], [1, 1], [1, 1]]
    async for ctx, results in MemoryOrchestrator.run(
        DataFlow.auto(standard_scaler, GetSingle),
        [
            Input(
                value=[standard_scaler.op.outputs["result"].name],
                definition=GetSingle.op.inputs["spec"],
            ),
            Input(
                value=input_data,
                definition=standard_scaler.op.inputs["data"],
            ),
        ],
    ):
        # assertEqual reports both values on failure, unlike
        # assertTrue(a == b) which only reports "False is not true"
        self.assertEqual(
            results[standard_scaler.op.outputs["result"].name], output_data
        )