def test_pipeline_from_and_to_java_json(self):
    """Load a pipeline from Java-generated JSON, run it, and re-serialize it.

    The first stage of the loaded pipeline is applied to a four-column
    table; the sink is expected to receive only the values of columns
    ``a`` and ``b``. Serializing the loaded pipeline again must give back
    exactly the JSON it was loaded from.
    """
    # Pipeline description as generated by the Java API.
    expected_json = (
        '[{"stageClassName":"org.apache.flink.ml.pipeline.'
        'UserDefinedPipelineStages$SelectColumnTransformer",'
        '"stageJson":"{\\"selectedCols\\":\\"[\\\\\\"a\\\\\\",'
        '\\\\\\"b\\\\\\"]\\"}"}]'
    )

    # Deserialize, then serialize straight back for the round-trip check.
    pipeline = Pipeline()
    pipeline.load_json(expected_json)
    round_tripped_json = pipeline.to_json()

    ml_env = MLEnvironmentFactory().get_default()
    t_env = ml_env.get_stream_table_environment()

    sink_fields = ['a', 'b']
    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT()]
    sink = source_sink_utils.TestAppendSink(sink_fields, sink_types)
    t_env.register_table_sink("TestJsonResults", sink)

    rows = [(1, 2, 3, 4), (4, 3, 2, 1)]
    source_table = t_env.from_elements(rows, ['a', 'b', 'c', 'd'])
    first_stage = pipeline.get_stages()[0]
    transformed = first_stage.transform(t_env, source_table)
    exec_insert_table(transformed, "TestJsonResults")

    collected = source_sink_utils.results()

    self.assert_equals(collected, ["1,2", "4,3"])
    self.assertEqual(round_tripped_json, expected_json)
def test_fit_table(self):
    """Fit a pipeline stage on a table built in a new ML environment.

    The stage comes from ``self.create_pipeline_stage()``; the test passes
    when its ``fitted`` attribute is truthy after ``fit`` has been called.
    """
    env_id = MLEnvironmentFactory.get_new_ml_environment_id()
    ml_env = MLEnvironmentFactory.get(env_id)

    source_env = ml_env.get_stream_table_environment()
    input_table = source_env.from_elements([(1, 2, 3)])

    stage = self.create_pipeline_stage()
    # The table environment is requested again for the fit call, mirroring
    # the two separate lookups this test has always performed.
    fit_env = ml_env.get_stream_table_environment()
    stage.fit(fit_env, input_table)

    self.assertTrue(stage.fitted)
def test_java_transformer(self):
    """Run ``WrapperTransformer`` on a small table and check the sink rows.

    The transformer is configured with ``selected_cols=["a", "b"]`` and
    applied to a four-column table; the sink is expected to contain the
    ``a`` and ``b`` values of each input row.
    """
    ml_env = MLEnvironmentFactory().get_default()
    t_env = ml_env.get_stream_table_environment()

    sink_fields = ['a', 'b']
    sink_types = [DataTypes.BIGINT(), DataTypes.BIGINT()]
    sink = source_sink_utils.TestAppendSink(sink_fields, sink_types)
    t_env.register_table_sink("TransformerResults", sink)

    rows = [(1, 2, 3, 4), (4, 3, 2, 1)]
    source_table = t_env.from_elements(rows, ['a', 'b', 'c', 'd'])

    transformer = WrapperTransformer(selected_cols=["a", "b"])
    transformed = transformer.transform(t_env, source_table)
    # Submit the insert and block until it completes before reading results.
    insert_result = transformed.execute_insert("TransformerResults")
    insert_result.wait()

    collected = source_sink_utils.results()
    self.assert_equals(collected, ["1,2", "4,3"])
def test_pipeline(self):
    """Fit a transformer + estimator pipeline and verify its predictions.

    A ``PythonAddTransformer`` writes a ``features`` column derived from
    ``a`` and ``b``; a ``PythonEstimator`` reads ``features`` and writes
    ``predict_result``. The fitted pipeline is applied to a serving table
    and the boolean predictions collected by the sink are compared with
    the expected values.
    """
    t_env = MLEnvironmentFactory().get_default().get_stream_table_environment()

    train_table = t_env.from_elements(
        [(1, 2), (1, 4), (1, 0), (10, 2), (10, 4), (10, 0)], ['a', 'b'])
    serving_table = t_env.from_elements([(0, 0), (12, 3)], ['a', 'b'])

    table_sink = source_sink_utils.TestAppendSink(
        ['predict_result'], [DataTypes.BOOLEAN()])
    t_env.register_table_sink("PredictResults", table_sink)

    # transformer, output features column which is the sum of a and b.
    transformer = PythonAddTransformer(selected_cols=["a", "b"], output_col="features")

    # estimator
    estimator = PythonEstimator() \
        .set_vector_col("features") \
        .set_prediction_col("predict_result")

    # pipeline
    pipeline = Pipeline().append_stage(transformer).append_stage(estimator)

    # execute: execute_insert submits the insert job directly (replacing the
    # deprecated insert_into + t_env.execute pair); wait() blocks until the
    # job is done so the sink is populated before results are read.
    pipeline \
        .fit(t_env, train_table) \
        .transform(t_env, serving_table) \
        .execute_insert('PredictResults') \
        .wait()

    actual = source_sink_utils.results()
    # the first input is false since 0 + 0 is smaller than the max_sum 14.
    # the second input is true since 12 + 3 is bigger than the max_sum 14.
    self.assert_equals(actual, ["false", "true"])