    def fit(self, X, y):
        """
        Fit the tree classifier

        Arguments:
            X {pd.DataFrame} -- DataFrame containing the features
            y {pd.Series} -- Label vector
        """

        X = assert_df(X)
        y = assert_series(y)
        self.attributes = [
            a for a in self.domain.attributes if a.name in X.columns.values
        ]
        self.columns = [a.name for a in self.attributes]

        s_domain = Domain(self.attributes, class_vars=self.domain.class_var)
        rows = pd.concat([X[self.columns], y], axis=1).values.tolist()
        train = Table.from_list(domain=s_domain, rows=rows)

        if isinstance(self.domain.class_var, DiscreteVariable):
            self.tree = TreeClassifier().fit_storage(train)
        else:
            self.tree = TreeRegressor().fit_storage(train)
        return self
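
# For context, a minimal standalone sketch of the Orange3 round-trip the
# method above performs: build a Domain, convert a pandas DataFrame plus a
# label Series into an Orange Table, then fit a tree. The toy data and
# variable names are illustrative, not part of the repo.
import pandas as pd
from Orange.classification import TreeLearner
from Orange.data import ContinuousVariable, DiscreteVariable, Domain, Table

X = pd.DataFrame({"a": [0.1, 0.9, 0.2, 0.8], "b": [1.0, 0.0, 1.0, 0.0]})
y = pd.Series(["no", "yes", "no", "yes"], name="label")

attributes = [ContinuousVariable(c) for c in X.columns]
domain = Domain(attributes, class_vars=DiscreteVariable("label", values=("no", "yes")))
rows = pd.concat([X, y], axis=1).values.tolist()
train = Table.from_list(domain=domain, rows=rows)
tree = TreeLearner()(train)  # Learner.__call__ preprocesses and calls fit_storage
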
    def __init__(self, f_types, l_type, shape, **kwargs):
        """
        Initialize feature bookkeeping and estimator parameters

        Arguments:
            f_types {pd.Series} -- Series of feature types
            l_type {str} -- Type of label
            shape {tuple} -- Shape of the training data
        """
        self.f_types = assert_series(f_types)
        self.l_type = assert_l_type(l_type)
        self.shape = shape
        self._init_parameters(**kwargs)
        self.is_fitted = False
        self.names = self.f_types.index.tolist()
        self.feature_importances = {name: -1 for name in self.names}  # -1 = not computed yet
    def __init__(self, f_types, l_type, **kwargs):
        """
        Class which predicts labels for unseen samples

        Arguments:
            f_types {pd.Series} -- Series of feature types
            l_type {str} -- Type of label
        """
        self.f_types = assert_series(f_types)
        self.l_type = assert_l_type(l_type)
        self.params = {
            "knn_neighbors": kwargs.get("knn_neighbors", 6),
            "nominal_distance": kwargs.get("nominal_distance", 1),
            "distance_metric": kwargs.get("distance_metric", "partial"),
        }
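
# Hypothetical instantiation of the class above. The class name Predictor,
# the feature names, and the "numeric"/"nominal" type labels are assumptions
# for illustration; only the keyword arguments come from the snippet itself.
import pandas as pd

f_types = pd.Series(["numeric", "nominal"], index=["age", "color"])
predictor = Predictor(f_types, l_type="numeric", knn_neighbors=8)
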
    def fit(self, X, y):
        if self.is_fitted:
            print("Selector is already fitted")
            return self

        X = assert_df(X).reset_index(drop=True)
        y = assert_series(y).reset_index(drop=True)
        data = Data(X, y, self.f_types, self.l_type, X.shape)
        self.data = data.shuffle_rows()

        # The tree-based evaluation method needs the Orange domain of the data
        self.domain = None
        if self.params["eval_method"] == "tree":
            self.domain = self.data.to_table().domain

        # The MI-based evaluation method works on salted (jittered) data
        if self.params["eval_method"] == "mi":
            self.data = self.data.add_salt()

        self._fit()
        self.is_fitted = True
        return self
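
# Expected call pattern for the fit method above, assuming a concrete
# selector subclass (the name RKNN and the eval_method value are purely
# illustrative) and X, y, f_types, l_type as in the earlier snippets:
selector = RKNN(f_types, l_type, X.shape, eval_method="mi")
selector.fit(X, y)
selector.fit(X, y)  # second call prints a notice and returns early
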
import numpy as np
from scipy.special import digamma

def _get_mi_cc(X, y, f_types, l_type, k, dist):
    """
    Estimate mutual information between the features and a continuous
    label when at least one feature is continuous, by counting how
    many samples fall inside a given radius

    Arguments:
        X {pd.DataFrame} -- DataFrame containing the features
        y {pd.Series} -- Label vector
        f_types {pd.Series} -- Series of feature types
        l_type {str} -- Type of label
        k {int} -- Number of nearest neighbors
        dist {float} -- Distance used between differing nominal values
    """
    # -1 marks rows for which no finite radius can be found
    nx = np.ones(X.shape[0]) * -1
    ny = np.ones(X.shape[0]) * -1

    # Row-wise sorted distance matrix over the features
    D_x = get_dist_matrix(X, f_types, nominal_distance=dist)
    D_x.sort()

    # Treat the label vector as a one-column frame to reuse the distance code
    new_y = assert_df(y)
    new_types = assert_series(l_type)
    D_y = get_dist_matrix(new_y, new_types, nominal_distance=dist)
    D_y.sort()

    for row in range(X.shape[0]):
        # Get distances inside features and labels
        dist_x = D_x[row, :]
        dist_y = D_y[row, :]

        # Update statistics only if the neighbor radius is finite
        # (infinite distances stem from missing values)
        radius = max(dist_x[k + 1], dist_y[k + 1])
        if not np.isinf(radius):
            nx[row] = (dist_x <= radius).sum() - 1
            ny[row] = (dist_y <= radius).sum() - 1

    # Compute the mask before overwriting nx, otherwise ny is filtered
    # with a mask of the wrong length
    mask = nx >= 0
    nx = nx[mask]
    ny = ny[mask]

    mi = digamma(len(nx)) + digamma(k) - (1 / k) - \
        digamma(np.mean(nx)) - digamma(np.mean(ny))
    return max(mi, 0)  # MI is non-negative; clamp small negative estimates
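
# The loop above mirrors a KSG-style kNN estimate of mutual information
# (Kraskov et al., 2004), except that the *mean* neighbor counts are plugged
# into the digamma terms rather than averaging the digammas per sample. As a
# rough sanity check on purely continuous data, one can compare against
# scikit-learn's KSG implementation; the toy data below is illustrative and
# the two estimators will not agree exactly.
import numpy as np
from sklearn.feature_selection import mutual_info_regression

rng = np.random.default_rng(0)
x = rng.normal(size=500)
y_check = x + 0.1 * rng.normal(size=500)
print(mutual_info_regression(x.reshape(-1, 1), y_check, n_neighbors=6))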