def test_scale(self):
    data = {'col1': [1.0, 2.0, 3.0], 'col2': [4.0, 5.0, 6.0]}

    #
    # No model
    #
    X = pd.DataFrame(data)

    y = transform(resolve_full_name('sklearn.preprocessing:scale'), 'all', X['col1'], None, {}, None)  # It returns ndarray

    self.assertEqual(len(y), 3)
    self.assertAlmostEqual(y.mean(), 0.0)
    self.assertAlmostEqual(y.std(ddof=0), 1.0)

    #
    # Use a model (keyword arguments): center the column but do not scale it
    #
    X = pd.DataFrame(data)

    model = {'with_mean': True, 'with_std': False}
    y = transform(resolve_full_name('sklearn.preprocessing:scale'), 'all', X['col2'], None, model, None)

    self.assertAlmostEqual(y.mean(), 0.0)
    self.assertAlmostEqual(y.std(ddof=0), 0.816496580927726)
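# For reference, the numbers asserted above follow directly from standardization:
# centering [4, 5, 6] gives [-1, 0, 1], whose population std is sqrt(2/3).
# A minimal standalone check of the same arithmetic (assumes only numpy and
# scikit-learn, independent of the transform() machinery):

import numpy as np
from sklearn.preprocessing import scale

col2 = np.array([4.0, 5.0, 6.0])

# Full standardization: zero mean, unit population standard deviation
y = scale(col2)
assert np.isclose(y.mean(), 0.0) and np.isclose(y.std(ddof=0), 1.0)

# Centering only (with_std=False): the spread is unchanged, so the
# population std stays sqrt(2/3) ~= 0.816496580927726
y = scale(col2, with_mean=True, with_std=False)
assert np.isclose(y.std(ddof=0), np.sqrt(2.0 / 3.0))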
def test_UDF(self):
    data = {'col1': [1.0, 2.0, 3.0], 'col2': [4.0, 5.0, 6.0]}

    #
    # No model. Single input
    #
    X = pd.DataFrame(data)

    out = transform(resolve_full_name('test_transform:udf1'), 'one', X[['col2']], None, {}, None)

    self.assertEqual(len(out), 3)
    self.assertAlmostEqual(out[0], 5.0)
    self.assertAlmostEqual(out[1], 6.0)
    self.assertAlmostEqual(out[2], 7.0)

    #
    # Has model. Parameters flattened
    #
    model = {'addition': 1.0}
    X = pd.DataFrame(data)

    out = transform(resolve_full_name('test_transform:udf2'), 'one', X['col2'], None, model, None)

    self.assertEqual(len(out), 3)
    self.assertAlmostEqual(out[0], 5.0)
    self.assertAlmostEqual(out[1], 6.0)
    self.assertAlmostEqual(out[2], 7.0)

    #
    # No model. Row input
    #
    X = pd.DataFrame(data)

    out = transform(resolve_full_name('test_transform:udf3'), 'one', X[['col1', 'col2']], None, {}, None)

    self.assertEqual(len(out), 3)
    self.assertAlmostEqual(out[0], 5.0)
    self.assertAlmostEqual(out[1], 7.0)
    self.assertAlmostEqual(out[2], 9.0)

    #
    # Has model. Row input
    #
    model = {'addition': 1.0}
    X = pd.DataFrame(data)

    out = transform(resolve_full_name('test_transform:udf4'), 'one', X[['col1', 'col2']], None, model, None)

    self.assertEqual(len(out), 3)
    self.assertAlmostEqual(out[0], 6.0)
    self.assertAlmostEqual(out[1], 8.0)
    self.assertAlmostEqual(out[2], 10.0)
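# The four UDFs referenced above live in the test_transform module. Below is a
# minimal sketch of definitions consistent with the asserted outputs. The exact
# calling convention of the 'one' scope (scalar per value for a single input,
# Series per row for multiple inputs) is an assumption; the actual helpers in
# the test module may differ.

def udf1(value):
    # Single input, no model: add a constant shift
    return value + 1.0

def udf2(value, addition):
    # Single input; model parameters are flattened into keyword arguments
    return value + addition

def udf3(row):
    # Row input (a Series with one field per input column), no model
    return row['col1'] + row['col2']

def udf4(row, addition):
    # Row input plus flattened model parameters
    return row['col1'] + row['col2'] + addition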
def evaluate(self):
    """
    Evaluate this column.
    """
    log.info(" ===> Start evaluating column '{0}'".format(self.id))

    #
    # Stage 1: Ensure that the "data" field is ready for applying column operations
    #
    table = self.table.data  # Table the columns will be added to

    #
    # Stage 2: Generate a list of concrete definitions by imposing extensions on the base definition.
    # The "extensions" field determines whether this definition describes a single column or a family of columns.
    #
    concrete_definitions = self.get_definitions()
    num_extensions = len(concrete_definitions)

    for i, definition in enumerate(concrete_definitions):

        #
        # Stage 3. Resolve the function
        #
        func_name = definition.get('function')
        func = resolve_full_name(func_name)
        if not func:
            log.warning("Cannot resolve user-defined function '{0}'. Skip column definition.".format(func_name))
            break

        scope = definition.get('scope')

        #
        # Stage 4. Prepare input data argument to pass to the function (as the first argument)
        #
        data = table
        inputs = definition.get('inputs')
        if inputs is None:
            inputs = []
        inputs = get_columns(inputs, data)
        if inputs is None:
            log.warning("Error reading column list. Skip column definition.")
            break

        # Validation: check if all explicitly specified columns are available
        if not all_columns_exist(inputs, data):
            log.warning("Not all columns available. Skip column definition.")
            break

        # Select only the specified columns
        data = data[inputs]

        data_type = definition.get('data_type')

        #
        # Stage 5. Prepare model object to pass to the function (as the second argument).
        # It can be necessary to instantiate the argument object by using the specified class.
        # It can be necessary to generate (train) a model (we need some specific logic to determine such a need).
        #
        model_ref = definition.get('model')
        model_type = definition.get('model_type')
        if model_ref and isinstance(model_ref, str) and model_ref.startswith('$'):
            log.info("Load model from {0}.".format(model_ref))
            model = get_value(model_ref)  # De-reference the model if it is given by reference (a string starting with $)
        else:
            model = model_ref

        train = definition.get('train')
        if not model and train:

            # 1. Resolve the training function
            train_func_name = train.get('function')
            train_func = resolve_full_name(train_func_name)
            if not train_func:
                log.warning("Cannot resolve user-defined training function '{0}'. Skip training.".format(train_func_name))
                break

            # 2. Filter rows for train data
            train_table = table
            train_row_filter = train.get("row_filter")
            if train_row_filter:
                train_table = apply_row_filter(table, train_row_filter)

            # 3. Select columns to use for training
            train_data = train_table
            train_inputs = train.get('inputs')
            if train_inputs is None:
                train_inputs = inputs  # Inherit from the 'apply' section
            train_inputs = get_columns(train_inputs, train_data)
            if train_inputs is None:
                log.warning("Error reading column list for training. Skip column definition.")
                break

            # Validation: check if all explicitly specified columns are available
            if not all_columns_exist(train_inputs, train_data):
                log.warning("Not all columns available for training. Skip column definition.")
                break

            # Select only the specified columns
            train_data = train_data[train_inputs]

            # 4. Determine labels:
            # - no labels at all (no argument is expected) - unsupervised learning
            # - explicitly specified outputs
            # - use the output column specified in the transformation (it has to be already available,
            #   e.g., loaded from source data, while the transformation will overwrite it)
            labels = train.get('outputs')
            if not labels:
                labels = definition.get('outputs')  # Same columns as used by the transformation
            if labels:
                labels = get_columns(labels, table)
                if labels is None:
                    log.warning("Error reading label column list. Skip column definition.")
                    break
                train_labels = train_table[labels]  # Select only the specified columns
            else:
                train_labels = None  # Do not pass any labels at all (unsupervised)

            # 5. Retrieve the hyper-model
            train_model = train.get('model', {})

            # Cast the data argument
            if data_type == 'ndarray':
                data_arg = train_data.values
                if train_labels is not None:
                    labels_arg = train_labels.values
            else:
                data_arg = train_data
                if train_labels is not None:
                    labels_arg = train_labels

            # 6. Call the function and generate a model
            if train_labels is None:
                model = train_func(data_arg, **train_model)
            else:
                if train_model is None:
                    model = train_func(data_arg, labels_arg)
                else:
                    model = train_func(data_arg, labels_arg, **train_model)

            # 7. Each time a new model is generated, we store it in the model field of the definition
            if model and model_ref:
                log.info("Store trained model in {0}.".format(model_ref))
                set_value(model_ref, model)

        elif not model and not train:
            model = {}

        #
        # Stage 6. Apply the function.
        # Depending on the "scope", the system will organize a loop over records or windows, or make a single call.
        # It also depends on the call options (how and what to pass in the data and model arguments: flattened json, ndarray, Series etc.)
        #
        out = transform(func, scope, data, data_type, model, model_type)

        #
        # Stage 7. Post-process the result by renaming the output columns accordingly (some convention is needed to know what output to expect)
        #
        outputs = definition.get('outputs', [])
        if isinstance(outputs, str):  # If a single name is provided (not a list), then we wrap it into a list
            outputs = [outputs]
        if not outputs:
            id = definition.get('id')
            # TODO: We could use smarter logic here by finding the parameter of the extension which really changes (is overwritten): inputs, function, outputs, scope, model etc.
            if num_extensions > 1:
                id = id + '_' + str(i)
            outputs.append(id)

        # TODO: The result could be a complex object, while some option (like 'result_path') could provide a path to access it, so we need to be able to retrieve the result (either here or in the transform function)
        # TODO: The result can be a Series/list/ndarray (1d or 2d) and we need to convert it to a DataFrame by using the original index.
        out = pd.DataFrame(out)  # Result can be an ndarray
        for j, c in enumerate(out.columns):
            if outputs and j < len(outputs):  # Explicitly specified output column name
                n = outputs[j]
            else:  # Same name - overwrite the input column
                n = inputs[j]
            table[n] = out[c]  # A column is attached by matching indexes, so indexes have to be consistent (the same)

    #
    # Stage 8. Post-process the whole family
    #
    log.info(" <=== Finish evaluating column '{0}'".format(self.id))
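# For orientation, a sketch of the kind of column definition consumed by
# evaluate() above. The field names ('function', 'scope', 'inputs', 'outputs',
# 'data_type', 'model', 'model_type', 'train') are the ones read by the code;
# all concrete values and the my_module functions are illustrative only.

definition = {
    'id': 'col2_centered',
    'function': 'sklearn.preprocessing:scale',  # resolved via resolve_full_name
    'scope': 'all',                             # single call on the whole column
    'inputs': ['col2'],
    'outputs': ['col2_centered'],
    'model': {'with_mean': True, 'with_std': False},  # flattened into keyword arguments
}

# A definition with a 'train' section: the model is trained first and stored
# by reference (a '$'-prefixed name handled by get_value/set_value).
trained_definition = {
    'id': 'prediction',
    'function': 'my_module:predict',    # hypothetical apply function
    'inputs': ['col1', 'col2'],
    'model': '$my_model',               # by-reference model storage
    'train': {
        'function': 'my_module:train',  # hypothetical training function
        'outputs': ['label'],           # labels for supervised training
        'model': {'max_depth': 3},      # hyper-parameters, flattened into keyword arguments
    },
}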
def evaluate(self):
    """
    Evaluate this column.
    Evaluation logic depends on the operation (definition) kind.
    """
    log.info(" ---> Start evaluating column '{0}'".format(self.id))

    #
    # Stage 1: Ensure that the data field (with table data) is ready for applying column operations
    #
    table = self.table.data  # Table the columns will be added to

    #
    # Stage 2: Generate a list of concrete definitions by imposing extensions on the base definition.
    # The "extensions" field determines whether this definition describes a single column or a family of columns.
    #
    concrete_definitions = self.get_definitions()
    num_extensions = len(concrete_definitions)

    # Essentially, we evaluate several columns independently
    for i, definition in enumerate(concrete_definitions):

        window = definition.get('window')

        operation = definition.get('operation')
        if not operation:  # Derive the default operation from the window
            if window is None or window == 'one' or window == '1':
                operation = 'calculate'  # Default
            elif window == 'all':
                operation = 'all'
            else:
                operation = 'roll'

        #
        # Stage 3. Resolve the function
        #
        func_name = definition.get('function')
        if not func_name:
            log.warning("Column function is not specified. Skip column definition.")
            break

        func = resolve_full_name(func_name)
        if not func:
            log.warning("Cannot resolve user-defined function '{0}'. Skip column definition.".format(func_name))
            break

        #
        # Stage 4. Prepare input data argument to pass to the function (as the first argument)
        #
        data = table
        inputs = definition.get('inputs', [])
        inputs = get_columns(inputs, data)
        if inputs is None:
            log.warning("Error reading column list. Skip column definition.")
            break

        # Validation: check if all explicitly specified columns are available
        if not all_columns_exist(inputs, data):
            log.warning("Not all columns available. Skip column definition.")
            break

        # Select only the specified input columns
        data = data[inputs]

        data_type = definition.get('data_type')

        #
        # Stage 5. Prepare model object to pass to the function (as the second argument)
        #
        model_type = definition.get('model_type')
        model = self.prepare_model(definition, inputs)
        if model is None:
            break

        #
        # Stage 6. Apply the function.
        # Depending on the "window", the system will organize a loop over records or windows, or make a single call.
        # It also depends on the call options (how and what to pass in the data and model arguments: flattened json, ndarray, Series etc.)
        #
        out = transform(func, window, data, data_type, model, model_type)

        #
        # Stage 7. Post-process the result by renaming the output columns accordingly (some convention is needed to know what output to expect)
        #
        outputs = definition.get('outputs', [])
        if isinstance(outputs, str):  # If a single name is provided (not a list), then we wrap it into a list
            outputs = [outputs]
        if not outputs:
            id = definition.get('id')
            # TODO: We could use smarter logic here by finding the parameter of the extension which really changes (is overwritten): inputs, function, outputs, window, model etc.
            if num_extensions > 1:
                id = id + '_' + str(i)
            outputs.append(id)

        # TODO: The result could be a complex object, while some option (like 'result_path') could provide a path to access it, so we need to be able to retrieve the result (either here or in the transform function)
        # TODO: The result can be a Series/list/ndarray (1d or 2d) and we need to convert it to a DataFrame by using the original index.
        out = pd.DataFrame(out)  # Result can be an ndarray
        for j, c in enumerate(out.columns):
            if outputs and j < len(outputs):  # Explicitly specified output column name
                n = outputs[j]
            else:  # Same name - overwrite the input column
                n = inputs[j]
            table[n] = out[c]  # A column is attached by matching indexes, so indexes have to be consistent (the same)

    #
    # Stage 8. Post-process the whole family
    #
    log.info(" <--- Finish evaluating column '{0}'".format(self.id))
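# The refactored evaluate() above delegates Stage 5 to self.prepare_model().
# Below is a minimal sketch of such a helper, condensed from the inline model
# logic of the earlier version. The method body and the train_model() helper it
# calls are assumptions; only the call-site contract (None signals an error and
# makes the caller skip the definition) is taken from the code above.

def prepare_model(self, definition, inputs):
    """Load, train or construct the model object for this definition.
    Return the model, or None on error (the caller then skips the definition)."""
    model_ref = definition.get('model')

    # De-reference a '$'-prefixed model reference, otherwise use the value as-is
    if model_ref and isinstance(model_ref, str) and model_ref.startswith('$'):
        model = get_value(model_ref)
    else:
        model = model_ref

    train = definition.get('train')
    if not model and train:
        model = self.train_model(definition, inputs)  # hypothetical helper holding the training logic of the earlier version
        if model is None:
            return None  # Training failed
        if model_ref:
            set_value(model_ref, model)  # Store the freshly trained model by reference
    elif not model and not train:
        model = {}  # Nothing specified and nothing to train: pass an empty model

    return model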