class FeatherCache(Cache):
    """Cache data in a feather file.

    Note that this node requires the following packages:

    * [pyarrow](https://pypi.org/project/pyarrow/)
    """

    CODE_URL = get_node_url("caches/feather_cache.py")

    def __init__(self, filename: Optional[str] = None):
        if filename is not None:
            self.filename = filename
        else:
            self.filename = f"FeatherCache-{uuid.uuid4()}.feather"

    def fit(self, data: Optional[pd.DataFrame] = None) -> None:
        # Only write the file on the first fit, and only if there is data
        if data is not None and not self.is_cached():
            data.reset_index(drop=True).to_feather(self.filename)

    def run(self, data: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        if self.is_cached():
            return pd.read_feather(self.filename)
        else:
            return data

    def is_cached(self) -> bool:
        return os.path.isfile(self.filename)

    def clear_cache(self) -> None:
        os.remove(self.filename)
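A minimal usage sketch, assuming pyarrow is installed; the filename and the DataFrame contents are invented for illustration:

```python
import pandas as pd

# Hypothetical data; any DataFrame with a default index works
df = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]})

cache = FeatherCache("my_cache.feather")
cache.fit(df)           # first fit writes the feather file
cached = cache.run(df)  # subsequent runs read from the file instead
cache.clear_cache()     # removes the feather file from disk
```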
class Collate(Node):
    """Collate multiple data streams into a single one"""

    CODE_URL = get_node_url("filters/collate.py")

    def run(self, *args):

        # Split into X and y
        X = []
        y = []
        for arg in args:
            if not is_empty(arg):
                if arg[0] is not None:
                    X.append(arg[0])
                if arg[1] is not None:
                    y.append(arg[1])

        # Concatenate X (should never be None)
        X = pd.concat(X).sort_index()

        # Concatenate y
        if len(y) == 0:
            y = None
        else:
            y = pd.concat(y).sort_index()

        return X, y
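For illustration, a sketch of collating two `(X, y)` streams; the frames and index values are made up:

```python
import pandas as pd

# Two hypothetical (X, y) tuples with disjoint indexes
X1 = pd.DataFrame({"a": [1.0, 2.0]}, index=[0, 2])
y1 = pd.Series([10.0, 20.0], index=[0, 2])
X2 = pd.DataFrame({"a": [3.0]}, index=[1])
y2 = pd.Series([30.0], index=[1])

X, y = Collate().run((X1, y1), (X2, y2))
# X and y now contain all rows, concatenated and sorted by index (0, 1, 2)
```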
class NullFallback(Node):
    """Replace missing values with values from a different column

    Parameters
    ----------
    fallback_list : List[Tuple[str, str]]
        Fallbacks to use. Should be a list of tuples, where the first
        element of each tuple is the name of a feature with potentially
        missing values, and the second element is the name of the feature
        whose values should replace them.
    n : int
        Number of times to apply the replacements sequentially (repeated
        passes allow replacements to propagate through chained fallbacks)
    """

    CODE_URL = get_node_url("transforms/null_fallback.py")

    def __init__(self, fallback_list: List[Tuple[str, str]], n: int = 1):
        super().__init__()
        self.fallback_list = fallback_list
        self.n = n

    def run(
        self, X: pd.DataFrame, y: Optional[pd.Series]
    ) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        for _ in range(self.n):
            for missing, less_missing in self.fallback_list:
                ix = X[missing].isnull()
                X.loc[ix, missing] = X.loc[ix, less_missing]
        return X, y
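A sketch of how repeated passes propagate replacements; the feature names and values are hypothetical:

```python
import numpy as np
import pandas as pd

# Fall back from unit-level to building-level values, and from
# building-level to city-level values
X = pd.DataFrame({
    "unit_sqft": [np.nan, 800.0],
    "building_sqft": [np.nan, 900.0],
    "city_sqft": [1000.0, 1100.0],
})

node = NullFallback(
    [("unit_sqft", "building_sqft"), ("building_sqft", "city_sqft")], n=2
)
X, _ = node.run(X, None)
# Row 0: building_sqft is filled from city_sqft on the first pass, and
# unit_sqft is then filled from building_sqft on the second pass
```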
class MeanSquaredError(Metric):
    """Mean squared error regression metric"""

    CODE_URL = get_node_url("metrics/mean_squared_error.py")

    def run(self, y_pred: pd.Series, y_true: pd.Series):
        return ((y_pred - y_true).pow(2)).mean()

    def get_metric_name(self):
        return "mean_squared_error"
class MedianAbsolutePercentageError(Metric):
    """Median absolute percentage error (MdAPE) metric, in percent"""

    CODE_URL = get_node_url("metrics/median_absolute_percentage_error.py")

    def run(self, y_pred: pd.Series, y_true: pd.Series):
        return 100 * ((y_pred - y_true).abs() / y_true).median()

    def get_metric_name(self):
        return "median_absolute_percentage_error"
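A quick sketch exercising both metric nodes; the predictions and targets are invented:

```python
import pandas as pd

y_true = pd.Series([100.0, 200.0, 300.0])
y_pred = pd.Series([110.0, 190.0, 330.0])

mse = MeanSquaredError().run(y_pred, y_true)
# (10**2 + 10**2 + 30**2) / 3 = 366.67

mdape = MedianAbsolutePercentageError().run(y_pred, y_true)
# median of [10%, 5%, 10%] = 10.0
```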
class FeatureFilter(Node):
    """Filter features / fields down to a specific subset"""

    CODE_URL = get_node_url("filters/feature_filter.py")

    def __init__(self, features: List[str]):
        self.features = features

    def run(self, X: pd.DataFrame, y: Optional[pd.Series]):
        return X[self.features], y
class ItemFilter(Node):
    """Filter datapoints down to a subset matching some condition"""

    CODE_URL = get_node_url("filters/item_filter.py")

    def __init__(self, filter_function: Callable):
        self.filter_function = filter_function

    def run(self, X: pd.DataFrame, y: Optional[pd.Series]):
        ix = self.filter_function(X)
        if ix.sum() == 0:
            return EMPTY
        elif y is None:
            return X.loc[ix, :], None
        else:
            return X.loc[ix, :], y.loc[ix]
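A sketch showing the two filter nodes together; the feature names and cutoff are invented:

```python
import pandas as pd

X = pd.DataFrame({"price": [50.0, 150.0], "rooms": [1, 3], "noise": [0, 1]})
y = pd.Series([1.0, 2.0])

# Keep only the two features we care about
X, y = FeatureFilter(["price", "rooms"]).run(X, y)

# Keep only rows with price above 100 (the filter function should return
# a boolean Series indexed like X)
X, y = ItemFilter(lambda df: df["price"] > 100).run(X, y)
```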
class CatBoostRegressorModel(Model):
    """A CatBoost regression model

    Parameters
    ----------
    verbose : bool
        Whether to print information during training
    thread_count : int
        How many threads to use for fitting
    cat_features : List[str]
        List of feature names to treat as categorical
    loss_function : str
        What loss function to use. Some examples include:

        * "RMSE"
        * "Quantile:alpha=0.5"
        * "Quantile:alpha=0.15"

    kwargs
        All keyword arguments (including the above) are passed to
        CatBoostRegressor.

    Examples
    --------

    ```python
    model = CatBoostRegressorModel(
        verbose=False,
        thread_count=4,
        loss_function="Quantile:alpha=0.5",
    )
    ```
    """

    CODE_URL = get_node_url("models/catboost_regressor_model.py")

    def __init__(self, **kwargs):
        self.model = CatBoostRegressor(**kwargs)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series]) -> None:
        self.model = self.model.fit(X, y)

    def predict(self, X: pd.DataFrame) -> pd.Series:
        return pd.Series(data=self.model.predict(X), index=X.index)
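A fit-and-predict sketch, assuming catboost is installed; the training data is synthetic and made up for illustration:

```python
import numpy as np
import pandas as pd

# Synthetic data: price is roughly linear in square footage
X = pd.DataFrame({"sqft": np.random.uniform(500, 2000, 100)})
y = pd.Series(100.0 * X["sqft"] + np.random.randn(100))

model = CatBoostRegressorModel(verbose=False, loss_function="RMSE")
model.fit(X, y)
preds = model.predict(X)  # a pd.Series aligned with X's index
```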
class RenameFields(Node):
    """Rename fields / columns

    Parameters
    ----------
    field_map : Dict[str, str]
        Dictionary with fields to rename and their new names. Keys should
        be the old/existing field names, and the corresponding values
        should be the new names.
    """

    CODE_URL = get_node_url("transforms/rename_fields.py")

    def __init__(self, field_map: Dict[str, str]):
        self.field_map = field_map

    def run(self, X: pd.DataFrame):
        # A single rename call applies the whole mapping at once
        X.rename(columns=self.field_map, inplace=True)
        return X
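For illustration, a small sketch with invented column names:

```python
import pandas as pd

X = pd.DataFrame({"sq_ft": [800, 900], "br": [1, 2]})

node = RenameFields({"sq_ft": "sqft", "br": "bedrooms"})
X = node.run(X)  # columns are now "sqft" and "bedrooms"
```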
class CsvLoader(Loader):
    """Loads data from a csv file

    Parameters
    ----------
    filename : str
        Filename of the csv file to load
    columns : List[str]
        Columns to load. If None, all columns are loaded.
    kwargs
        Additional keyword arguments are passed to pandas.read_csv
    """

    CODE_URL = get_node_url("loaders/csv_loader.py")

    def __init__(
        self, filename: str, columns: Optional[List[str]] = None, **kwargs
    ):
        self.filename = filename
        self.columns = columns
        self.kwargs = kwargs

    def run(self):
        return pd.read_csv(self.filename, usecols=self.columns, **self.kwargs)
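A usage sketch; the filename and column names are placeholders:

```python
# Load two columns from a hypothetical csv file; extra keyword
# arguments (e.g. parse_dates) are forwarded to pd.read_csv
loader = CsvLoader("listings.csv", columns=["price", "sqft"])
df = loader.run()
```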
class LogTransformedModel(Model):
    """Log transform and normalize the target variable before fitting

    Parameters
    ----------
    base_model
        Model to fit on the log-transformed target data

    Examples
    --------

    ```python
    model = LogTransformedModel(
        CatBoostRegressorModel(
            verbose=False,
            thread_count=4,
            loss_function="Quantile:alpha=0.5",
        )
    )
    ```
    """

    CODE_URL = get_node_url("models/log_transformed_model.py")

    def __init__(self, base_model):
        self.base_model = base_model
        self._mean = None
        self._std = None

    def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
        log_y = np.log(y)
        self._mean = np.nanmean(log_y)
        self._std = np.nanstd(log_y)
        self.base_model.fit(X, (log_y - self._mean) / self._std)

    def predict(self, X: pd.DataFrame) -> pd.Series:
        return np.exp(self.base_model.predict(X) * self._std + self._mean)
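A round-trip sketch with synthetic data; note that the target must be strictly positive for the log transform to be defined:

```python
import numpy as np
import pandas as pd

X = pd.DataFrame({"sqft": np.random.uniform(500, 2000, 100)})
y = pd.Series(np.exp(0.001 * X["sqft"]))  # positive, log-linear target

model = LogTransformedModel(CatBoostRegressorModel(verbose=False))
model.fit(X, y)           # fits on the standardized log of y
preds = model.predict(X)  # predictions are mapped back via exp
```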
class SqlAlchemyLoader(Loader):
    """Loads data via a SQL query using SQLAlchemy

    .. admonition::

        This Loader requires that the sqlalchemy package is installed, and
        also the jinja2 package if you use `query_kwargs`.

    Parameters
    ----------
    engine : sqlalchemy engine
        SQLAlchemy engine for connecting to the database
    query_filename : str
        Filename of a file containing the SQL query. The file can use
        Jinja templating, and `query_kwargs` will be used as the template
        parameters.
    query_kwargs : dict
        Dictionary of arguments used to template the query
    """

    CODE_URL = get_node_url("loaders/sqlalchemy_loader.py")

    def __init__(self, engine, query_filename, query_kwargs=None):
        self.engine = engine
        self.query = self.format_query(query_filename, query_kwargs or {})

    def format_query(self, query_filename, query_kwargs):
        with open(query_filename, "r") as fid:
            if query_kwargs:
                from jinja2 import Template
                return Template(fid.read()).render(**query_kwargs)
            else:
                return fid.read()

    def run(self):
        return pd.read_sql(self.query, self.engine)
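A sketch of templated loading; the connection string, query file, and template variable are all placeholders:

```python
from sqlalchemy import create_engine

# The query file might contain, for example:
#   SELECT * FROM listings WHERE city = '{{ city }}'
engine = create_engine("postgresql://user:password@localhost:5432/mydb")
loader = SqlAlchemyLoader(engine, "listings.sql", query_kwargs={"city": "Boston"})
df = loader.run()
```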
class InMemoryCache(Cache):
    """Cache data in memory"""

    CODE_URL = get_node_url("caches/in_memory_cache.py")

    def __init__(self):
        self._data = None

    def fit(self, data: Optional[pd.DataFrame] = None) -> None:
        if self._data is None:
            self._data = data

    def run(self, data: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        if self.is_cached():
            return self._data
        else:
            return data

    def is_cached(self) -> bool:
        return self._data is not None

    def clear_cache(self) -> None:
        self._data = None
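A minimal sketch; the DataFrame is invented:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

cache = InMemoryCache()
cache.fit(df)          # first fit stores the DataFrame
same_df = cache.run()  # later runs return the stored data
cache.clear_cache()    # drops the reference so new data can be cached
```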
class Imputer(Node):
    """Impute missing values

    Parameters
    ----------
    features : str or List[str]
        Features to impute. If an empty list (the default), missing
        values will be imputed for all features.
    continuous_method : str {'mean' or 'median'}
        Method to use for imputing continuous features.

        * mean: fill missing values with the mean value for that feature
        * median: fill missing values with the median value for that feature

    categorical_method : str {'mode'}
        Method to use for imputing categorical features.

        * mode: fill missing values with the most common value for that
          feature
    """

    CODE_URL = get_node_url("transforms/imputer.py")

    continuous_methods = {
        "mean": lambda c: c.mean(),
        "median": lambda c: c.median(),
    }

    categorical_methods = {
        "mode": lambda c: c.mode().iloc[0],
    }

    categorical_dtypes = ["object", "category"]

    def __init__(
        self,
        features: Union[str, List[str]] = [],
        continuous_method: str = "mean",
        categorical_method: str = "mode",
    ):
        super().__init__()

        # Check methods are valid
        if continuous_method not in self.continuous_methods:
            raise ValueError(
                f"Invalid continuous_method '{continuous_method}'. "
                f"Valid values are: {list(self.continuous_methods)}")
        if categorical_method not in self.categorical_methods:
            raise ValueError(
                f"Invalid categorical_method '{categorical_method}'. "
                f"Valid values are: {list(self.categorical_methods)}")

        # Store attributes
        self.features = features if isinstance(features, list) else [features]
        self.continuous_method = self.continuous_methods[continuous_method]
        self.categorical_method = self.categorical_methods[categorical_method]

    def fit(self, X: pd.DataFrame, y: pd.Series) -> None:

        # Impute for all features by default
        if len(self.features) == 0:
            self.features = X.columns.tolist()

        # Compute the value to use for imputation
        self.values = {}
        for feature in self.features:
            if X[feature].dtype.name in self.categorical_dtypes:
                self.values[feature] = self.categorical_method(X[feature])
            else:
                self.values[feature] = self.continuous_method(X[feature])

    def run(self, X: pd.DataFrame,
            y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        for feature in self.features:
            X.loc[X[feature].isnull(), feature] = self.values[feature]
        return X, y
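A sketch imputing one continuous and one categorical feature; the column names and values are made up:

```python
import numpy as np
import pandas as pd

X = pd.DataFrame({
    "sqft": [800.0, np.nan, 1000.0],
    "city": pd.Series(["Boston", None, "Boston"], dtype="category"),
})
y = pd.Series([1.0, 2.0, 3.0])

imputer = Imputer(continuous_method="median", categorical_method="mode")
imputer.fit(X, y)         # learns median sqft (900) and modal city ("Boston")
X, y = imputer.run(X, y)  # fills the missing sqft and city values
```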