Пример #1
0
    def owen_zhang(x_train, y_train, x_test, seed=0, alpha=0, noise=0.01):
        """
        Owen Zhang's leave-one-out + noise likelihood encoding

        "Winning data science competitions"
        http://de.slideshare.net/ShangxuanZhang/winning-data-science-competitions-presented-by-owen-zhang

        Every column of ``x_train`` is encoded on its own by two
        ``LikelihoodEstimator`` objects, both fitted on that column and
        ``y_train``: a leave-one-out estimator with ``noise`` supplies the
        training encoding (its ``x_likelihoods`` attribute), and a noise-free
        estimator without leave-one-out supplies the test encoding through
        ``predict_proba``.

        :param x_train: training features, 1-D or 2-D array-like
        :param y_train: training targets
        :param x_test: test features laid out like ``x_train``
        :param seed: ``seed`` forwarded to both estimators
        :param alpha: ``alpha`` forwarded to both estimators
        :param noise: ``noise`` forwarded to the training estimator only
        :return: ``(xx_train, xx_test)`` with the per-column encodings stacked
            horizontally, or ``(None, None)`` when there are no columns
        """
        # Convert to ndarrays *before* reshaping: pandas objects and plain
        # sequences do not provide an ndarray-style ``reshape``.
        x_train = np.asarray(x_train)
        x_test = np.asarray(x_test)
        if x_train.ndim == 1:
            x_train = x_train.reshape(-1, 1)
        # Checked independently so a 1-D test set also works with 2-D training data.
        if x_test.ndim == 1:
            x_test = x_test.reshape(-1, 1)
        ncols = x_train.shape[1]
        nclass = np.unique(y_train).shape[0]

        train_parts = []
        test_parts = []

        for i in range(ncols):
            le_train = LikelihoodEstimator(noise=noise, alpha=alpha, leave_one_out=True, seed=seed). \
                fit(x_train[:, i], y_train)
            le_test = LikelihoodEstimator(noise=0, alpha=alpha, leave_one_out=False, seed=seed). \
                fit(x_train[:, i], y_train)
            lh_train = le_train.x_likelihoods.copy()
            lh_test = le_test.predict_proba(x_test[:, i])

            if nclass <= 2:
                # With at most two classes keep only row 1 of the transposed
                # likelihoods, shaped as a single column.
                lh_train = lh_train.T[1].reshape(-1, 1)
                lh_test = lh_test.T[1].reshape(-1, 1)

            train_parts.append(lh_train)
            test_parts.append(lh_test)

        if not train_parts:
            return None, None
        # Stack once at the end instead of re-copying the result per column.
        return np.hstack(train_parts), np.hstack(test_parts)
Пример #2
0
    def fit(self, x):
        """Build one ``ECDF`` object per column of ``x``.

        The objects are stored in ``self.ecdfs`` keyed by column index; any
        previously stored mapping is discarded.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; 1-D input is reshaped into a single column
        :return: ``self``
        """
        self.ecdfs = {}
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        n_columns = x.shape[1]
        numpy_input = is_numpy(x)

        for col in range(n_columns):
            if numpy_input:
                column = x[:, col]
            else:
                # Non-numpy input is read through ``.iloc`` and unwrapped
                # to its underlying values.
                column = x.iloc[:, col].values
            self.ecdfs[col] = ECDF(column)
        return self
Пример #3
0
    def fit(self, x):
        """Fit an ``ECDF`` for every column of ``x``.

        Results are kept in ``self.ecdfs`` (column index -> ``ECDF``); the
        dictionary is rebuilt from scratch on every call.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; 1-D input is reshaped into a single column
        :return: ``self``
        """
        self.ecdfs = {}
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        n_columns = x.shape[1]
        numpy_input = is_numpy(x)

        for col in range(n_columns):
            # Non-numpy input is read through ``.iloc`` and unwrapped to
            # its underlying values before being handed to ``ECDF``.
            column = x[:, col] if numpy_input else x.iloc[:, col].values
            self.ecdfs[col] = ECDF(column)
        return self
Пример #4
0
    def fit(self, x, y):
        """Fit one ``LikelihoodEstimator`` per column of ``x``.

        Each estimator is constructed from ``self.get_params()`` and fitted
        on a single column together with ``y``.  The number of distinct
        values in ``y`` is stored in ``self.nclass``.

        :param x: 1-D or 2-D array-like of features
        :param y: targets
        :return: ``self``
        """
        # Convert before reshaping: pandas objects and plain sequences do
        # not provide an ndarray-style ``reshape``.
        x = np.asarray(x)
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        ncols = x.shape[1]

        self.nclass = np.unique(y).shape[0]

        # Rebuild the list on every call so that refitting replaces the
        # estimators of an earlier fit instead of appending after them.
        params = self.get_params()
        self.estimators = [LikelihoodEstimator(**params).fit(x[:, i], y)
                           for i in range(ncols)]
        return self
Пример #5
0
    def fit(self, x, y):
        """Fit a separate ``LikelihoodEstimator`` on every column of ``x``.

        Estimators are created with the keyword arguments returned by
        ``self.get_params()``.  ``self.nclass`` is set to the number of
        distinct values found in ``y``.

        :param x: 1-D or 2-D array-like of features
        :param y: targets
        :return: ``self``
        """
        # Convert before reshaping: pandas objects and plain sequences do
        # not provide an ndarray-style ``reshape``.
        x = np.asarray(x)
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        ncols = x.shape[1]

        self.nclass = np.unique(y).shape[0]

        # Start from an empty list so a second ``fit`` does not leave the
        # estimators of the first one in front of the new ones.
        params = self.get_params()
        estimators = []
        for i in range(ncols):
            estimators.append(
                LikelihoodEstimator(**params).fit(x[:, i], y))
        self.estimators = estimators
        return self
Пример #6
0
    def transform(self, x):
        """Map every column of ``x`` through its ECDF and then ``self.ppf``.

        ``self.ppf`` is presumably an inverse-CDF function -- confirm
        against the class definition.  Results are written back into the
        data, so they are stored with whatever dtype the container holds.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; 1-D input is reshaped into a single column
        :return: the transformed data; a copy of ``x`` is used when
            ``self.copy`` is true, otherwise ``x`` itself is written to
        """
        data = x.copy() if self.copy else x
        if len(data.shape) == 1:
            data = data.reshape(-1, 1)
        n_columns = data.shape[1]
        numpy_input = is_numpy(data)

        for col in range(n_columns):
            column_ecdf = self.ecdfs[col]
            if numpy_input:
                quantiles = column_ecdf(data[:, col])
                data[:, col] = self.ppf(quantiles)
            else:
                quantiles = column_ecdf(data.iloc[:, col])
                data.iloc[:, col] = self.ppf(quantiles)
        return data
Пример #7
0
    def transform(self, x):
        """Encode every column of ``x`` with its fitted estimator.

        Column ``i`` is passed to ``self.estimators[i].predict``.  When
        ``self.nclass`` is at most two, only row 1 of the transposed
        prediction is kept, shaped as a single column.

        :param x: 1-D or 2-D array-like of features
        :return: the per-column results stacked horizontally, or ``None``
            when ``x`` has no columns
        """
        # Convert before reshaping: pandas objects and plain sequences do
        # not provide an ndarray-style ``reshape``.
        x = np.asarray(x)
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        ncols = x.shape[1]

        parts = []
        for i in range(ncols):
            lh = self.estimators[i].predict(x[:, i])
            if self.nclass <= 2:
                lh = lh.T[1].reshape(-1, 1)
            parts.append(lh)

        # Stack once at the end instead of re-copying the result per column.
        return np.hstack(parts) if parts else None
Пример #8
0
    def transform(self, x):
        """Apply the stored per-column ECDFs followed by ``self.ppf``.

        ``self.ppf`` is presumably an inverse-CDF function -- confirm
        against the class definition.  Each transformed column is assigned
        back into the data it came from.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; 1-D input is reshaped into a single column
        :return: the transformed data; a copy of ``x`` is used when
            ``self.copy`` is true, otherwise ``x`` itself is written to
        """
        if self.copy:
            x = x.copy()
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        n_columns = x.shape[1]

        if is_numpy(x):
            for col in range(n_columns):
                column_ecdf = self.ecdfs[col]
                x[:, col] = self.ppf(column_ecdf(x[:, col]))
        else:
            for col in range(n_columns):
                column_ecdf = self.ecdfs[col]
                x.iloc[:, col] = self.ppf(column_ecdf(x.iloc[:, col]))
        return x
Пример #9
0
    def fit(self, x):
        """Learn, per column, which values are frequent enough to keep.

        The occurrences of every distinct value are counted per column.  A
        value whose count is below ``self.threshold`` is mapped to
        ``self.value``; every other value is mapped to itself.  The
        resulting dictionaries are stored in ``self.new_values`` keyed by
        column index.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; 1-D input is reshaped into a single column
        :return: ``self``
        """
        self.new_values = {}
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        n_columns = x.shape[1]
        numpy_input = is_numpy(x)

        for col in range(n_columns):
            if numpy_input:
                occurrences = dict(Counter(x[:, col]))
            else:
                occurrences = x.iloc[:, col].value_counts().to_dict()

            mapping = {}
            for key, count in occurrences.items():
                mapping[key] = self.value if count < self.threshold else key
            self.new_values[col] = mapping
        return self
Пример #10
0
    def fit(self, x):
        """Record a value -> replacement mapping for every column of ``x``.

        Values occurring fewer than ``self.threshold`` times in a column
        are mapped to ``self.value``; all others map to themselves.  The
        mappings are stored in ``self.new_values`` under the column index.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; 1-D input is reshaped into a single column
        :return: ``self``
        """
        self.new_values = {}
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        n_columns = x.shape[1]
        numpy_input = is_numpy(x)

        for col in range(n_columns):
            if numpy_input:
                occurrences = dict(Counter(x[:, col]))
            else:
                occurrences = x.iloc[:, col].value_counts().to_dict()
            self.new_values[col] = {
                key: (self.value if count < self.threshold else key)
                for key, count in occurrences.items()
            }
        return self
Пример #11
0
    def fit(self, x):
        """Count how often each distinct value occurs in every column.

        The per-column dictionaries (value -> count) are stored in
        ``self.counts`` keyed by column index.  When ``self.min_count`` is
        positive, counts below it are replaced by ``self.nan_value``.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; 1-D input is reshaped into a single column
        :return: ``self``
        """
        self.counts = {}
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        n_columns = x.shape[1]
        numpy_input = is_numpy(x)

        for col in range(n_columns):
            if numpy_input:
                occurrences = dict(Counter(x[:, col]))
            else:
                occurrences = x.iloc[:, col].value_counts().to_dict()

            if self.min_count > 0:
                filtered = {}
                for key, count in occurrences.items():
                    filtered[key] = self.nan_value if count < self.min_count else count
                occurrences = filtered
            self.counts[col] = occurrences
        return self
Пример #12
0
    def transform(self, x):
        """Run each column of ``x`` through the estimator fitted for it.

        ``self.estimators[i].predict`` is called with column ``i``.  When
        ``self.nclass`` is at most two, only row 1 of the transposed
        prediction is kept, shaped as a single column.

        :param x: 1-D or 2-D array-like of features
        :return: the per-column results stacked horizontally, or ``None``
            when ``x`` has no columns
        """
        # Convert before reshaping: pandas objects and plain sequences do
        # not provide an ndarray-style ``reshape``.
        x = np.asarray(x)
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        ncols = x.shape[1]

        parts = []
        for i in range(ncols):
            lh = self.estimators[i].predict(x[:, i])
            if self.nclass <= 2:
                lh = lh.T[1].reshape(-1, 1)
            parts.append(lh)

        if not parts:
            return None
        # A single stack at the end avoids re-copying the result per column.
        return np.hstack(parts)
Пример #13
0
    def transform(self, x):
        """Replace every value in ``x`` by the count stored for it.

        Counts are read from ``self.counts[i]`` for column ``i``.  A value
        with no entry there is treated as having a count of zero: it becomes
        ``self.nan_value`` when ``self.min_count > 0`` and ``0`` otherwise,
        mirroring how counts below ``min_count`` are replaced at fit time.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; a 1-D array is reshaped into a single column
        :return: the encoded data; a copy of ``x`` is used when
            ``self.copy`` is true, otherwise ``x`` itself is written to
        """
        if self.copy:
            x = x.copy()
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        ncols = x.shape[1]
        is_np = is_numpy(x)

        unseen = self.nan_value if self.min_count > 0 else 0

        for i in range(ncols):
            cnt = self.counts[i]
            if is_np:
                # Exact lookup: resolve each distinct value once, then
                # spread the results over the rows via the inverse index.
                # Binning against the sorted keys would hand unseen values
                # a neighbour's count and overflow above the largest key.
                uniques, inverse = np.unique(x[:, i], return_inverse=True)
                mapped = np.array([cnt.get(u, unseen) for u in uniques])
                x[:, i] = mapped[inverse.ravel()]
            else:
                # Assign the encoded column back explicitly; an in-place
                # ``replace`` on ``x.iloc[:, i]`` may act on a temporary copy.
                col = x.iloc[:, i]
                x.iloc[:, i] = col.map(lambda value: cnt.get(value, unseen)).values
        return x
Пример #14
0
    def fit(self, x):
        """Store value counts for every column of ``x`` in ``self.counts``.

        ``self.counts`` maps a column index to a dictionary of
        value -> count.  If ``self.min_count`` is greater than zero, any
        count smaller than it is stored as ``self.nan_value`` instead.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; 1-D input is reshaped into a single column
        :return: ``self``
        """
        self.counts = {}
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        n_columns = x.shape[1]
        numpy_input = is_numpy(x)

        for col in range(n_columns):
            if numpy_input:
                occurrences = dict(Counter(x[:, col]))
            else:
                occurrences = x.iloc[:, col].value_counts().to_dict()
            if self.min_count > 0:
                occurrences = {
                    key: (self.nan_value if count < self.min_count else count)
                    for key, count in occurrences.items()
                }
            self.counts[col] = occurrences
        return self
Пример #15
0
    def transform(self, x):
        """Encode every value of ``x`` as the count recorded for it.

        Column ``i`` is looked up in ``self.counts[i]``.  Values without an
        entry are treated as having a count of zero: they become
        ``self.nan_value`` when ``self.min_count > 0`` and ``0`` otherwise,
        mirroring how counts below ``min_count`` are replaced at fit time.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; a 1-D array is reshaped into a single column
        :return: the encoded data; a copy of ``x`` is used when
            ``self.copy`` is true, otherwise ``x`` itself is written to
        """
        if self.copy:
            x = x.copy()
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        ncols = x.shape[1]
        is_np = is_numpy(x)

        unseen = self.nan_value if self.min_count > 0 else 0

        for i in range(ncols):
            cnt = self.counts[i]
            if is_np:
                # Exact lookup per distinct value; the inverse index then
                # expands the result to one entry per row.  Binning against
                # the sorted keys would give unseen values a neighbour's
                # count and index out of range above the largest key.
                uniques, inverse = np.unique(x[:, i], return_inverse=True)
                mapped = np.array([cnt.get(u, unseen) for u in uniques])
                x[:, i] = mapped[inverse.ravel()]
            else:
                # Write the result back explicitly; an in-place ``replace``
                # on ``x.iloc[:, i]`` may only modify a temporary copy.
                col = x.iloc[:, i]
                x.iloc[:, i] = col.map(lambda value: cnt.get(value, unseen)).values
        return x
Пример #16
0
    def transform(self, x):
        """Encode every column with its stored encoder.

        When ``self.ive`` is set, the data is first passed through
        ``self.ive.transform``.  Each column is then transformed by
        ``self.encoders[i]`` and shifted by ``self.first_category`` before
        being written back.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; 1-D input is reshaped into a single column
        :return: the encoded data; a copy of ``x`` is used when
            ``self.copy`` is true
        """
        data = x.copy() if self.copy else x
        if len(data.shape) == 1:
            data = data.reshape(-1, 1)
        n_columns = data.shape[1]
        numpy_input = is_numpy(data)

        if self.ive is not None:
            data = self.ive.transform(data)

        for col in range(n_columns):
            encoder = self.encoders[col]
            if numpy_input:
                codes = encoder.transform(data[:, col])
                data[:, col] = codes + self.first_category
            else:
                codes = encoder.transform(data.iloc[:, col])
                data.iloc[:, col] = codes + self.first_category
        return data
Пример #17
0
    def fit(self, x):
        """Fit one ``LabelEncoder`` per column of ``x``.

        When ``self.min_count`` is positive, the data is first passed
        through an ``InfrequentValueEncoder`` built with
        ``threshold=self.min_count`` and ``value=np.finfo(float).min``; the
        label encoders are then fitted on its output.  The encoders are
        stored in ``self.encoders`` keyed by column index.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; 1-D input is reshaped into a single column
        :return: ``self``
        """
        self.encoders = {}
        # Reset alongside the encoders: ``transform`` applies ``self.ive``
        # whenever it is not None, so an encoder left over from an earlier
        # fit with a positive ``min_count`` must not survive a refit.
        self.ive = None
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        ncols = x.shape[1]
        is_np = is_numpy(x)

        if self.min_count > 0:
            self.ive = InfrequentValueEncoder(threshold=self.min_count, value=np.finfo(float).min)
            x = self.ive.fit_transform(x)

        for i in range(ncols):
            column = x[:, i] if is_np else x.iloc[:, i]
            self.encoders[i] = LabelEncoder().fit(column)
        return self
Пример #18
0
    def fit(self, x):
        """Fit a ``LabelEncoder`` for every column of ``x``.

        With a positive ``self.min_count`` the data is first run through an
        ``InfrequentValueEncoder`` (``threshold=self.min_count``,
        ``value=np.finfo(float).min``) and the label encoders are fitted on
        the result.  Encoders are stored in ``self.encoders`` by column index.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; 1-D input is reshaped into a single column
        :return: ``self``
        """
        self.encoders = {}
        # ``transform`` applies ``self.ive`` whenever it is not None, so
        # clear it here; otherwise a refit with ``min_count <= 0`` would
        # keep using the encoder from a previous fit.
        self.ive = None
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        ncols = x.shape[1]
        is_np = is_numpy(x)

        if self.min_count > 0:
            self.ive = InfrequentValueEncoder(threshold=self.min_count,
                                              value=np.finfo(float).min)
            x = self.ive.fit_transform(x)

        for i in range(ncols):
            if is_np:
                column = x[:, i]
            else:
                column = x.iloc[:, i]
            self.encoders[i] = LabelEncoder().fit(column)
        return self
Пример #19
0
    def transform(self, x):
        """Apply the stored per-column encoders to ``x``.

        If ``self.ive`` is not None the data goes through
        ``self.ive.transform`` first.  The output of ``self.encoders[i]``
        for column ``i`` is offset by ``self.first_category`` and assigned
        back into that column.

        :param x: numpy array, or an object supporting ``.iloc`` column
            access; 1-D input is reshaped into a single column
        :return: the encoded data; a copy of ``x`` is used when
            ``self.copy`` is true
        """
        if self.copy:
            x = x.copy()
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        n_columns = x.shape[1]
        numpy_input = is_numpy(x)

        if self.ive is not None:
            x = self.ive.transform(x)

        for col in range(n_columns):
            encoder = self.encoders[col]
            if not numpy_input:
                x.iloc[:, col] = encoder.transform(x.iloc[:, col]) + self.first_category
                continue
            x[:, col] = encoder.transform(x[:, col]) + self.first_category
        return x
Пример #20
0
    def owen_zhang(x_train, y_train, x_test, seed=0, alpha=0, noise=0.01):
        """
        Owen Zhang's leave-one-out + noise likelihood encoding

        "Winning data science competitions"
        http://de.slideshare.net/ShangxuanZhang/winning-data-science-competitions-presented-by-owen-zhang

        Columns are encoded independently.  For each one, two
        ``LikelihoodEstimator`` objects are fitted on the training column and
        ``y_train``: a leave-one-out estimator with ``noise`` whose
        ``x_likelihoods`` become the training encoding, and a noise-free
        estimator without leave-one-out whose ``predict_proba`` output
        becomes the test encoding.

        :param x_train: training features, 1-D or 2-D array-like
        :param y_train: training targets
        :param x_test: test features laid out like ``x_train``
        :param seed: ``seed`` forwarded to both estimators
        :param alpha: ``alpha`` forwarded to both estimators
        :param noise: ``noise`` forwarded to the training estimator only
        :return: ``(xx_train, xx_test)`` with the per-column encodings stacked
            horizontally, or ``(None, None)`` when there are no columns
        """
        # Convert to ndarrays *before* reshaping: pandas objects and plain
        # sequences do not provide an ndarray-style ``reshape``.
        x_train = np.asarray(x_train)
        x_test = np.asarray(x_test)
        if x_train.ndim == 1:
            x_train = x_train.reshape(-1, 1)
        # Checked independently so a 1-D test set also works with 2-D training data.
        if x_test.ndim == 1:
            x_test = x_test.reshape(-1, 1)
        ncols = x_train.shape[1]
        nclass = np.unique(y_train).shape[0]

        train_parts = []
        test_parts = []

        for i in range(ncols):
            le_train = LikelihoodEstimator(noise=noise, alpha=alpha, leave_one_out=True, seed=seed). \
                fit(x_train[:, i], y_train)
            le_test = LikelihoodEstimator(noise=0, alpha=alpha, leave_one_out=False, seed=seed). \
                fit(x_train[:, i], y_train)
            lh_train = le_train.x_likelihoods.copy()
            lh_test = le_test.predict_proba(x_test[:, i])

            if nclass <= 2:
                # With at most two classes keep only row 1 of the transposed
                # likelihoods, shaped as a single column.
                lh_train = lh_train.T[1].reshape(-1, 1)
                lh_test = lh_test.T[1].reshape(-1, 1)

            train_parts.append(lh_train)
            test_parts.append(lh_test)

        if not train_parts:
            return None, None
        # Stack once at the end instead of re-copying the result per column.
        xx_train = np.hstack(train_parts)
        xx_test = np.hstack(test_parts)
        return xx_train, xx_test