Example #1

class FeatherCache(Cache):
    """Cache data in a feather file.

    Note that this node requires the following packages:

    * [pyarrow](https://pypi.org/project/pyarrow/)
    """

    CODE_URL = get_node_url("caches/feather_cache.py")

    def __init__(self, filename: Optional[str] = None):
        if filename is not None:
            self.filename = filename
        else:
            self.filename = f"FeatherCache-{uuid.uuid4()}.feather"

    def fit(self, data: Optional[pd.DataFrame] = None) -> None:
        if data is not None and not self.is_cached():
            data.reset_index(drop=True).to_feather(self.filename)

    def run(self, data: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        if self.is_cached():
            return pd.read_feather(self.filename)
        else:
            return data

    def is_cached(self) -> bool:
        return os.path.exists(self.filename) and os.path.isfile(self.filename)

    def clear_cache(self) -> None:
        os.remove(self.filename)
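
A minimal usage sketch (not from the source) of the cache lifecycle, assuming the FeatherCache class above is importable and pyarrow is installed; the filename is illustrative:

```python
import pandas as pd

cache = FeatherCache("my_cache.feather")  # hypothetical filename

df = pd.DataFrame({"a": [1, 2, 3]})
cache.fit(df)               # first fit writes the feather file
assert cache.is_cached()

cached = cache.run()        # later runs read from disk, ignoring new input
cache.clear_cache()         # removes the feather file
```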

Example #2

class Collate(Node):
    """Collate multiple data streams into a single one"""

    CODE_URL = get_node_url("filters/collate.py")

    def run(self, *args):

        # Split into X and y
        X = []
        y = []
        for arg in args:
            if not is_empty(arg):
                if arg[0] is not None:
                    X.append(arg[0])
                if arg[1] is not None:
                    y.append(arg[1])

        # Concatenate X (should never be None)
        X = pd.concat(X).sort_index()

        # Concatenate y
        if len(y) == 0:
            y = None
        else:
            y = pd.concat(y).sort_index()

        return X, y
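
A hedged usage sketch: each upstream stream is assumed to be an (X, y) tuple, which Collate stitches back together by index:

```python
import pandas as pd

X1 = pd.DataFrame({"a": [1, 2]}, index=[0, 1])
X2 = pd.DataFrame({"a": [3, 4]}, index=[2, 3])
y1 = pd.Series([10, 20], index=[0, 1])
y2 = pd.Series([30, 40], index=[2, 3])

# X comes back with rows 0-3 in index order; y is the matching Series
X, y = Collate().run((X1, y1), (X2, y2))
```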

Example #3

class NullFallback(Node):
    """Replaces missing values with values from a different column

    Parameters
    ----------
    fallback_list : List[Tuple[str, str]]
        Fallbacks to use.  Should be a list of tuples, where the first element
        in the tuple is the feature name with potentially missing elements, and
        the second element in the tuple is the feature name to replace any
        missing values with.
    n : int
        Number of sequential passes to perform.  Running more than one pass
        propagates values along a chain of fallbacks (see the sketch after
        this class).
    """

    CODE_URL = get_node_url("transforms/null_fallback.py")

    def __init__(self, fallback_list: List[Tuple[str, str]], n: int = 1):
        super().__init__()
        self.fallback_list = fallback_list
        self.n = n

    def run(self, X: pd.DataFrame, y: Optional[pd.Series]
            ) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
        for _ in range(self.n):
            for missing, less_missing in self.fallback_list:
                ix = X[missing].isnull()
                X.loc[ix, missing] = X.loc[ix, less_missing]
        return X, y
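
A small sketch of the n > 1 propagation behavior, with illustrative column names: the second pass lets a value fall all the way from `country_price` to `city_price`:

```python
import numpy as np
import pandas as pd

X = pd.DataFrame({
    "city_price": [1.0, np.nan, np.nan],
    "region_price": [2.0, 2.5, np.nan],
    "country_price": [3.0, 3.0, 3.0],
})
node = NullFallback(
    [("city_price", "region_price"), ("region_price", "country_price")],
    n=2,
)
X, _ = node.run(X, None)
# After two passes, X["city_price"] is [1.0, 2.5, 3.0]
```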

Example #4

class MeanSquaredError(Metric):
    """Mean squared error between predictions and true values"""

    CODE_URL = get_node_url("metrics/mean_squared_error.py")

    def run(self, y_pred: pd.Series, y_true: pd.Series):
        return ((y_pred - y_true).pow(2)).mean()

    def get_metric_name(self):
        return "mean_squared_error"

Example #5

class MedianAbsolutePercentageError(Metric):
    """Median absolute percentage error between predictions and true values"""

    CODE_URL = get_node_url("metrics/median_absolute_percentage_error.py")

    def run(self, y_pred: pd.Series, y_true: pd.Series):
        return 100 * ((y_pred - y_true).abs() / y_true).median()

    def get_metric_name(self):
        return "median_absolute_percentage_error"

Example #6

class FeatureFilter(Node):
    """Filter features / fields down to a specific subset"""

    CODE_URL = get_node_url("filters/feature_filter.py")

    def __init__(self, features: List[str]):
        self.features = features

    def run(self, X: pd.DataFrame, y: Optional[pd.Series]):
        return X[self.features], y
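
A one-line usage sketch with illustrative column names:

```python
import pandas as pd

X = pd.DataFrame({"price": [1.0], "sqft": [800], "noise": [0.3]})
X_small, y = FeatureFilter(["price", "sqft"]).run(X, None)
# X_small keeps only the "price" and "sqft" columns
```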

Example #7

class ItemFilter(Node):
    """Filter datapoints down to a subset matching some condition"""

    CODE_URL = get_node_url("filters/item_filter.py")

    def __init__(self, filter_function: Callable):
        self.filter_function = filter_function

    def run(self, X: pd.DataFrame, y: Optional[pd.Series]):
        ix = self.filter_function(X)
        if ix.sum() == 0:
            return EMPTY
        elif y is None:
            return X.loc[ix, :], None
        else:
            return X.loc[ix, :], y.loc[ix]
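
A hedged usage sketch: filter_function receives X and must return a boolean Series aligned with X's index:

```python
import pandas as pd

X = pd.DataFrame({"price": [10.0, -1.0, 5.0]})
y = pd.Series([1, 0, 1])

node = ItemFilter(lambda df: df["price"] > 0)
X_pos, y_pos = node.run(X, y)
# rows 0 and 2 survive in both X_pos and y_pos; if no rows had matched,
# the node would return EMPTY instead
```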

Example #8

class CatBoostRegressorModel(Model):
    """A CatBoost regression model


    Parameters
    ----------
    verbose : bool
        Whether to print information during training
    thread_count : int
        How many threads to use for fitting
    cat_features : List[str]
        List of feature names to treat as categorical
    loss_function : str
        What loss function to use.  Some examples include:

        * "RMSE"
        * "Quantile:alpha=0.5"
        * "Quantile:alpha=0.15"

    kwargs
        All keyword arguments (including the above) are passed to
        CatBoostRegressor.


    Examples
    --------

    ```python
    model = CatBoostRegressorModel(
        "model_name",
        verbose=False,
        thread_count=4,
        loss_function="Quantile:alpha=0.5",
    )
    ```
    """

    CODE_URL = get_node_url("models/catboost_regressor_model.py")

    def __init__(self, **kwargs):
        self.model = CatBoostRegressor(**kwargs)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series]) -> None:
        self.model = self.model.fit(X, y)

    def predict(self, X: pd.DataFrame) -> pd.Series:
        return pd.Series(data=self.model.predict(X), index=X.index)

Example #9

class RenameFields(Node):
    """Rename fields / columns

    Parameters
    ----------
    field_map : Dict[str, str]
        Dictionary with fields to rename and their new names.  Keys should be
        the old/existing field name, and the corresponding values should be the
        new names.
    """

    CODE_URL = get_node_url("transforms/rename_fields.py")

    def __init__(self, field_map: Dict[str, str]):
        self.field_map = field_map

    def run(self, X: pd.DataFrame):
        X.rename(columns=self.field_map, inplace=True)
        return X
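
A short usage sketch with illustrative column names; columns not in the map are left untouched:

```python
import pandas as pd

X = pd.DataFrame({"old_a": [1], "old_b": [2], "c": [3]})
X = RenameFields({"old_a": "a", "old_b": "b"}).run(X)
# X.columns is now ["a", "b", "c"]
```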

Example #10

class CsvLoader(Loader):
    """Loads data from a csv file

    Parameters
    ----------
    filename : str
        Filename of the csv file to load
    columns : List[str]
        Columns to load
    """

    CODE_URL = get_node_url("loaders/csv_loader.py")

    def __init__(self, filename: str, columns: Optional[List[str]] = None,
                 **kwargs):
        self.filename = filename
        self.columns = columns
        self.kwargs = kwargs

    def run(self):
        return pd.read_csv(self.filename, usecols=self.columns, **self.kwargs)
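
A minimal usage sketch; the filename and column names are illustrative, and any extra keyword arguments are forwarded to pandas.read_csv:

```python
loader = CsvLoader("sales.csv", columns=["date", "price"], sep=",")
df = loader.run()
```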

Example #11

class LogTransformedModel(Model):
    """Log transform and normalize the target variable before fitting


    Parameters
    ----------
    base_model
        Model to fit on the log-transformed target data


    Examples
    --------

    ```python
    model = LogTransformedModel(
        CatBoostRegressorModel(
            verbose=False,
            thread_count=4,
            loss_function="Quantile:alpha=0.5",
        )
    )
    ```
    """

    CODE_URL = get_node_url("models/log_transformed_model.py")

    def __init__(self, base_model):
        self.base_model = base_model
        self._mean = None
        self._std = None

    def fit(self, X: pd.DataFrame, y: pd.Series) -> None:
        log_y = np.log(y)
        self._mean = np.nanmean(log_y)
        self._std = np.nanstd(log_y)
        self.base_model.fit(X, (log_y - self._mean) / self._std)

    def predict(self, X: pd.DataFrame) -> pd.Series:
        return np.exp(self.base_model.predict(X) * self._std + self._mean)
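
A round-trip check (not from the source) using a stub base model that memorizes the transformed target, showing that predict() inverts the transform applied in fit():

```python
import numpy as np
import pandas as pd

class EchoModel:
    """Stub model (illustrative only): returns the fitted target as-is."""
    def fit(self, X, y):
        self.y = y
    def predict(self, X):
        return self.y

X = pd.DataFrame({"a": [1, 2, 3]})
y = pd.Series([10.0, 100.0, 1000.0])

model = LogTransformedModel(EchoModel())
model.fit(X, y)
np.testing.assert_allclose(model.predict(X), y)  # recovers y
```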

Example #12

class SqlAlchemyLoader(Loader):
    """Loads data via a SQL query using SQLAlchemy

    .. note::

        This Loader requires that the sqlalchemy package is installed, and
        also the jinja2 package if you use `query_kwargs`.

    Parameters
    ----------
    engine : sqlalchemy engine
        SqlAlchemy engine for connecting to the database
    query_filename : str
        Filename of a file containing the SQL query.  The file can use Jinja
        templating, and `query_kwargs` will be used as the template
        parameters.
    query_kwargs : dict
        Dictionary of arguments to be used to template the query
    """

    CODE_URL = get_node_url("loaders/sqlalchemy_loader.py")

    def __init__(self, engine, query_filename, query_kwargs=None):
        self.engine = engine
        self.query = self.format_query(query_filename, query_kwargs)

    def format_query(self, query_filename, query_kwargs):
        with open(query_filename, "r") as fid:
            if query_kwargs:
                from jinja2 import Template

                return Template(fid.read()).render(**query_kwargs)
            else:
                return fid.read()

    def run(self):
        return pd.read_sql(self.query, self.engine)
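
A hedged usage sketch; the database URL, query file, and template variable are all illustrative.  query.sql might contain a Jinja template such as `SELECT * FROM sales WHERE region = '{{ region }}'`:

```python
import sqlalchemy

engine = sqlalchemy.create_engine("sqlite:///sales.db")
loader = SqlAlchemyLoader(
    engine,
    query_filename="query.sql",
    query_kwargs={"region": "west"},
)
df = loader.run()
```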

Example #13

class InMemoryCache(Cache):
    """Cache data in memory"""

    CODE_URL = get_node_url("caches/in_memory_cache.py")

    def __init__(self):
        self._data = None

    def fit(self, data: Optional[pd.DataFrame] = None) -> None:
        if self._data is None:
            self._data = data

    def run(self, data: Optional[pd.DataFrame] = None) -> pd.DataFrame:
        if self.is_cached():
            return self._data
        else:
            return data

    def is_cached(self) -> bool:
        return self._data is not None

    def clear_cache(self) -> None:
        self._data = None
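
A short sketch of the cache semantics: the first fit stores the data, and later runs return the cached frame even when handed new input, until clear_cache() is called:

```python
import pandas as pd

cache = InMemoryCache()
df = pd.DataFrame({"a": [1, 2]})

cache.fit(df)
assert cache.run(pd.DataFrame()) is df  # cached frame wins over new input

cache.clear_cache()
assert not cache.is_cached()
```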

Example #14

class Imputer(Node):
    """Impute missing values

    Parameters
    ----------
    features : str or List[str]
        Features to impute.  If an empty list (the default), missing values
        will be imputed for all features.
    continuous_method : str {'mean' or 'median'}
        Method to use for imputing continuous features.

        * mean: fill missing values with the mean value for that feature
        * median: fill missing values with the median value for that feature
    categorical_method : str {'mode'}
        Method to use for imputing categorical features.

        * mode: fill missing values with the most common value for that
          feature
    """

    CODE_URL = get_node_url("transforms/imputer.py")

    continuous_methods = {
        "mean": lambda c: c.mean(),
        "median": lambda c: c.median(),
    }

    categorical_methods = {
        "mode": lambda c: c.mode().iloc[0],
    }

    categorical_dtypes = ["object", "category"]

    def __init__(
        self,
        features: Union[str, List[str]] = [],
        continuous_method: str = "mean",
        categorical_method: str = "mode",
    ):

        super().__init__()

        # Check methods are valid
        if continuous_method not in self.continuous_methods:
            raise ValueError(
                f"Invalid continuous_method '{continuous_method}'.  "
                f"Valid values are: {list(self.continuous_methods)}")
        if categorical_method not in self.categorical_methods:
            raise ValueError(
                f"Invalid categorical_method '{categorical_method}'.  "
                f"Valid values are: {list(self.categorical_methods)}")

        # Store attributes
        self.features = features if isinstance(features, list) else [features]
        self.continuous_method = self.continuous_methods[continuous_method]
        self.categorical_method = self.categorical_methods[categorical_method]

    def fit(self, X: pd.DataFrame, y: pd.Series) -> None:

        # Impute for all features by default
        if len(self.features) == 0:
            self.features = X.columns.tolist()

        # Compute the value to use for imputation
        self.values = {}
        for feature in self.features:
            if X[feature].dtype.name in self.categorical_dtypes:
                self.values[feature] = self.categorical_method(X[feature])
            else:
                self.values[feature] = self.continuous_method(X[feature])

    def run(self, X: pd.DataFrame,
            y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
        for feature in self.features:
            X.loc[X[feature].isnull(), feature] = self.values[feature]
        return X, y
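
A usage sketch with illustrative data: the numeric column is filled with its mean and the object-dtype column with its mode, using the default methods:

```python
import numpy as np
import pandas as pd

X = pd.DataFrame({
    "age": [20.0, np.nan, 40.0],
    "color": pd.Series(["red", "red", None], dtype="object"),
})
y = pd.Series([0, 1, 0])

imputer = Imputer()
imputer.fit(X, y)
X, y = imputer.run(X, y)
# X["age"] is [20.0, 30.0, 40.0]; X["color"] is all "red"
```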