def train(self, df, targets, conditions):
    """Validate the column specification, preprocess `df`, and fit the model.

    Arguments:
        df: source data handed to `utils.extract_sklearn_dataset`.
        targets: sequence holding exactly one entry. `entry[0]` is stored as
            the target column; `entry[1]`, lower-cased, must be 'numerical'.
        conditions: non-empty sequence of entries. `entry[0]` is the column;
            entries whose lower-cased `entry[1]` is 'categorical' go to
            `self.conditions_categorical`, all others to
            `self.conditions_numerical`.

    Raises:
        BLE: wrapping a ValueError when `targets` does not hold exactly one
            entry, when the target is not numerical, or when `conditions`
            is empty.
    """
    # Obtain the targets column.
    if len(targets) != 1:
        # The guard enforces exactly one target, so the message says
        # "exactly one" rather than "at least one".
        raise BLE(ValueError(
            'MultipleRegression requires exactly one column in '
            'targets. Received {}'.format(targets)))
    if targets[0][1].lower() != 'numerical':
        raise BLE(ValueError(
            'MultipleRegression can only regress NUMERICAL '
            'columns. Received {}'.format(targets)))
    self.targets = [targets[0][0]]

    # Obtain the condition columns.
    if len(conditions) < 1:
        raise BLE(ValueError(
            'MultipleRegression requires at least one '
            'column in conditions. Received {}'.format(conditions)))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for c in conditions:
        if c[1].lower() == 'categorical':
            self.conditions_categorical.append(c[0])
        else:
            self.conditions_numerical.append(c[0])
    # Numerical columns come first, categorical columns second.
    self.conditions = (
        self.conditions_numerical + self.conditions_categorical)

    # Empty placeholders are assigned before preprocessing, so every
    # attribute exists even if one of the extraction calls below raises.
    # The dataset.
    self.dataset = pd.DataFrame()
    # Lookup for categoricals to code.
    self.categories_to_val_map = dict()
    # Training set (regressors and labels).
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Linear regressors.
    self.mr_partial = LinearRegression()
    self.mr_full = LinearRegression()

    # Preprocess the data.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)

    # Train the multiple regression.
    self._train_mr()
def train(self, df, targets, conditions):
    """Check the requested columns, build the training arrays, and fit.

    Arguments:
        df: input data forwarded to `utils.extract_sklearn_dataset`.
        targets: one-element sequence. Index 0 of the element is kept as
            the target column; index 1, lower-cased, must be 'numerical'.
        conditions: sequence with at least one element. Index 0 of each
            element is the column; an element is categorical when its
            lower-cased index 1 equals 'categorical', otherwise it is
            handled as numerical.

    Raises:
        BLE: wrapping a ValueError if `targets` does not contain exactly
            one element, if the target is not numerical, or if
            `conditions` is empty.
    """
    # Obtain the targets column.
    if len(targets) != 1:
        # Message corrected to "exactly one" so it agrees with the
        # `!= 1` check (it previously claimed "at least one").
        raise BLE(ValueError(
            'MultipleRegression requires exactly one column '
            'in targets. Received {}'.format(targets)))
    if targets[0][1].lower() != 'numerical':
        raise BLE(ValueError(
            'MultipleRegression can only regress NUMERICAL '
            'columns. Received {}'.format(targets)))
    self.targets = [targets[0][0]]

    # Obtain the condition columns.
    if len(conditions) < 1:
        raise BLE(ValueError(
            'MultipleRegression requires at least one '
            'column in conditions. Received {}'.format(conditions)))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for c in conditions:
        if c[1].lower() == 'categorical':
            self.conditions_categorical.append(c[0])
        else:
            self.conditions_numerical.append(c[0])
    # Combined ordering: numerical columns, then categorical columns.
    self.conditions = (
        self.conditions_numerical + self.conditions_categorical)

    # The attributes below start out empty and are overwritten by the
    # preprocessing step further down.
    # The dataset.
    self.dataset = pd.DataFrame()
    # Lookup for categoricals to code.
    self.categories_to_val_map = dict()
    # Training set (regressors and labels).
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Linear regressors.
    self.mr_partial = LinearRegression()
    self.mr_full = LinearRegression()

    # Preprocess the data.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)

    # Train the multiple regression.
    self._train_mr()
def train(self, df, targets, conditions):
    """Validate the column specification, preprocess `df`, and fit the forest.

    Arguments:
        df: source data handed to `utils.extract_sklearn_dataset`.
        targets: sequence holding exactly one entry. `entry[0]` is stored as
            the target column; `entry[1]`, lower-cased, must be
            'categorical'.
        conditions: non-empty sequence of entries. `entry[0]` is the column;
            entries whose lower-cased `entry[1]` is 'categorical' are
            collected separately from all the others.

    Raises:
        BLE: wrapping a ValueError when `targets` does not hold exactly one
            entry, when the target is not categorical, or when `conditions`
            is empty.
    """
    # --- Target column -------------------------------------------------
    if len(targets) != 1:
        raise BLE(
            ValueError('RandomForest requires exactly one column in '
                       'targets. Received {}'.format(targets)))
    target_spec = targets[0]
    if target_spec[1].lower() != 'categorical':
        raise BLE(
            ValueError('RandomForest can only classify CATEGORICAL '
                       'columns. Received {}'.format(targets)))
    self.targets = [target_spec[0]]

    # --- Condition columns ---------------------------------------------
    if len(conditions) == 0:
        raise BLE(
            ValueError('RandomForest requires at least one column in '
                       'conditions. Received {}'.format(conditions)))
    self.conditions_categorical = []
    self.conditions_numerical = []
    for spec in conditions:
        # Pick the destination list first, then append the column.
        if spec[1].lower() == 'categorical':
            bucket = self.conditions_categorical
        else:
            bucket = self.conditions_numerical
        bucket.append(spec[0])
    # Numerical columns are listed ahead of categorical ones.
    self.conditions = (
        self.conditions_numerical + self.conditions_categorical)

    # --- Empty placeholders, replaced by the preprocessing below -------
    self.dataset = pd.DataFrame()       # The dataset.
    self.categories_to_val_map = {}     # Lookup for categoricals to code.
    # Training set (regressors and labels).
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Random Forests.
    self.rf_partial = RandomForestClassifier(n_estimators=100)
    self.rf_full = RandomForestClassifier(n_estimators=100)

    # --- Preprocess the data -------------------------------------------
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical,
        self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)

    # --- Train the random forest ---------------------------------------
    self._train_rf()
def test_extract_sklearn_features_numerical():
    """Check that missing numerical entries are replaced by the column mean.

    Only columns 'A' and 'B' are requested; each has a single missing entry.
    """
    frame = pd.DataFrame({
        'A': [1, 2, None, 4, 10],
        'B': [0, -2, 3, None, 1],
        'C': ['1', '2', '3', '4', '5'],
        'D': [1, None, 3, 4, 5],
    })
    result = sku.extract_sklearn_features_numerical(['A', 'B'], frame)

    # Test correct imputation: means are taken over the four present values.
    fill_a = (1 + 2 + 4 + 10) / 4.0
    fill_b = (0 - 2 + 3 + 1) / 4.0
    # One row per observation, one column per requested feature.
    expected_rows = np.asarray([
        [1, 0],
        [2, -2],
        [fill_a, 3],
        [4, fill_b],
        [10, 1],
    ])
    assert np.array_equal(result, expected_rows)
def test_extract_sklearn_features_numerical():
    """Verify mean imputation in the numerical feature matrix.

    Columns 'A' and 'B' each contain one missing entry; the expected matrix
    has that entry replaced by the mean of the column's remaining values.
    """
    raw_columns = {
        'A': [1, 2, None, 4, 10],
        'B': [0, -2, 3, None, 1],
        'C': ['1', '2', '3', '4', '5'],
        'D': [1, None, 3, 4, 5],
    }
    dataset = pd.DataFrame(raw_columns)
    requested = ['A', 'B']
    observed = sku.extract_sklearn_features_numerical(requested, dataset)

    # Test correct imputation.
    imputed_a = sum([1, 2, 4, 10]) / 4.
    imputed_b = sum([0, -2, 3, 1]) / 4.
    column_a = [1, 2, imputed_a, 4, 10]
    column_b = [0, -2, 3, imputed_b, 1]
    # Stack the two feature columns side by side (observations as rows).
    wanted = np.column_stack([column_a, column_b])
    assert np.array_equal(observed, wanted)
def train(self, df, targets, conditions):
    """Check the requested columns, build the training arrays, and fit.

    Arguments:
        df: input data forwarded to `utils.extract_sklearn_dataset`.
        targets: one-element sequence. Index 0 of the element is kept as
            the target column; index 1, lower-cased, must be 'categorical'.
        conditions: sequence with at least one element. Index 0 of each
            element is the column; an element is categorical when its
            lower-cased index 1 equals 'categorical', otherwise it is
            handled as numerical.

    Raises:
        BLE: wrapping a ValueError if `targets` does not contain exactly
            one element, if the target is not categorical, or if
            `conditions` is empty.
    """
    # Obtain the targets column.
    if len(targets) != 1:
        raise BLE(ValueError(
            'RandomForest requires exactly one column in '
            'targets. Received {}'.format(targets)))
    if 'categorical' != targets[0][1].lower():
        raise BLE(ValueError(
            'RandomForest can only classify CATEGORICAL '
            'columns. Received {}'.format(targets)))
    self.targets = [targets[0][0]]

    # Obtain the condition columns.
    if len(conditions) < 1:
        raise BLE(ValueError(
            'RandomForest requires at least one column in '
            'conditions. Received {}'.format(conditions)))
    categorical_cols = []
    numerical_cols = []
    # Bind the (still empty) lists to the instance before filling them.
    self.conditions_categorical = categorical_cols
    self.conditions_numerical = numerical_cols
    for entry in conditions:
        is_categorical = entry[1].lower() == 'categorical'
        destination = categorical_cols if is_categorical else numerical_cols
        destination.append(entry[0])
    # Numerical columns precede categorical columns in the combined list.
    self.conditions = numerical_cols + categorical_cols

    # The dataset.
    self.dataset = pd.DataFrame()
    # Lookup for categoricals to code.
    self.categories_to_val_map = {}
    # Training set (regressors and labels).
    self.X_numerical = np.ndarray(0)
    self.X_categorical = np.ndarray(0)
    self.Y = np.ndarray(0)
    # Random Forests.
    self.rf_partial = RandomForestClassifier(n_estimators=100)
    self.rf_full = RandomForestClassifier(n_estimators=100)

    # Preprocess the data; each step replaces one placeholder above.
    self.dataset = utils.extract_sklearn_dataset(
        self.conditions, self.targets, df)
    self.categories_to_val_map = utils.build_categorical_to_value_map(
        self.conditions_categorical, self.dataset)
    self.X_categorical = utils.extract_sklearn_features_categorical(
        self.conditions_categorical, self.categories_to_val_map,
        self.dataset)
    self.X_numerical = utils.extract_sklearn_features_numerical(
        self.conditions_numerical, self.dataset)
    self.Y = utils.extract_sklearn_univariate_target(
        self.targets, self.dataset)

    # Train the random forest.
    self._train_rf()