示例#1
0
 def predict(self, X):
     """Predict a label for each sample by majority vote of the estimators.

     :param X: samples, passed unchanged to every estimator's ``predict``.
     :return: float array of shape (n_samples, 1) holding, per sample, the
         label predicted most often; ties go to the smallest label.
     :raises TypeError: if ``estimators_`` is missing or one of the
         estimators has no ``predict`` method.
     :raises ValueError: if ``estimators_`` is empty.
     """
     if not hasattr(self, "estimators_"):
         raise TypeError(
             "This estimator is not fitted yet: 'estimators_' is missing.")
     votes = []
     for estimator in self.estimators_:
         if not hasattr(estimator, "predict"):
             raise TypeError(
                 "Every fitted estimator must provide a 'predict' method.")
         # Flatten so that both (n,) and (n, 1) predictions are accepted.
         votes.append(np.asarray(estimator.predict(X)).ravel())
     if not votes:
         raise ValueError("'estimators_' is empty, nothing to vote with.")
     # One row per sample, one column per estimator that actually voted.
     votes = np.asarray(votes).T
     res = np.zeros((votes.shape[0], 1))
     for i, row in enumerate(votes):
         # np.bincount only accepts non-negative integers; counting the
         # unique values works for float-typed (and negative) labels too.
         labels, counts = np.unique(row, return_counts=True)
         res[i] = labels[np.argmax(counts)]
     return res
示例#2
0
 def split(self, X):
     """Yield leave-one-out train/test indices, one held-out sample at a time.

     :param X: samples; only their count is used.
     :return: generator of ``(train_indices, test_index)`` pairs, where
         ``train_indices`` is an array with every position except ``i``
         and ``test_index`` is the single held-out position ``i``.
     :raises ValueError: if ``X`` is None (raised once iteration starts,
         because this is a generator).
     """
     if X is None:
         raise ValueError("The 'X' parameter should not be none.")
     n_samples = _num_samples(X)
     indices = np.arange(n_samples)
     # Loop over the counted samples so the loop bound always agrees with
     # ``indices`` and does not additionally require ``X.shape``.
     for i in range(n_samples):
         yield np.concatenate((indices[:i], indices[i + 1:])), indices[i]
示例#3
0
 def split(self, X):
     """Draw one bootstrap sample of indices and report what was left out.

     :param X: samples; only their count is used.
     :return: tuple ``(drawn, left_out)`` where ``drawn`` is a list of
         n_samples positions sampled with replacement and ``left_out`` is
         a list of the positions that were never drawn.
     :raises ValueError: if ``X`` is None.
     """
     if X is None:
         raise ValueError("The 'X' parameter should not be none.")
     n_samples = _num_samples(X)
     # One independent draw per sample, i.e. sampling with replacement.
     drawn = [np.random.randint(n_samples) for _ in range(n_samples)]
     every_position = set(np.arange(n_samples))
     left_out = every_position - set(drawn)
     return drawn, list(left_out)
示例#4
0
def train_test_split(X, y, test_size=0.3, shuffle=False):
    """Hold-out split of ``X`` and ``y`` into a training and a test part.

    Only the calling convention mirrors sklearn's function of the same
    name; the implementation is deliberately much simpler.

    :param X: samples; indexed with an integer index array.
    :param y: targets with the same number of samples as ``X``; indexed
        with an integer index array.
    :param test_size: fraction of the samples, in [0, 1], put in the test
        part.
    :param shuffle: if True, shuffle the sample order with
        ``np.random.shuffle`` before splitting.
    :return: ``X_train, X_test, y_train, y_test``
    :raises ValueError: if ``X`` and ``y`` differ in sample count or
        ``test_size`` lies outside [0, 1].
    """
    n_samples = _num_samples(y)
    if not _num_samples(X) == n_samples:
        raise ValueError(
            "The 'X' and 'y' should be equivalent in first dimension")
    if not 0 <= test_size <= 1:
        raise ValueError(
            "The 'test_size' parameter should be a fraction in [0, 1].")
    indices = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(indices)
    # Round away floating-point noise before flooring: (1 - 0.9) * 10 is
    # 0.9999999999999998, which would otherwise floor to 0 training rows.
    train_num = int(np.floor(round((1 - test_size) * n_samples, 8)))
    train_idx, test_idx = indices[:train_num], indices[train_num:]
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    return X_train, X_test, y_train, y_test
 def one_hot_vector(y):
     """Encode the labels in ``y`` as one-hot rows.

     Columns follow the sorted order of the distinct labels returned by
     ``np.unique``.

     :param y: labels; validated with ``check_array`` first.
     :return: array of shape (n_samples, n_distinct_labels) with a single
         1 per row in the column of that sample's label.
     """
     y = check_array(y)
     n_samples = _num_samples(y)
     labels = np.unique(y)
     # Map every distinct label to the column it occupies.
     column_of = {label: col for col, label in enumerate(labels)}
     encoded = np.zeros((n_samples, len(labels)))
     for row in range(n_samples):
         encoded[row][column_of[y[row]]] = 1
     return encoded
示例#6
0
def _median(X):
    """Yield the midpoint of every pair of adjacent values in X.

    The caller is expected to sort X beforehand. When X holds exactly one
    value, that value itself is yielded.

    :param X: array-like of values
    """
    count = _num_samples(X)
    if count == 1:
        yield X[0]
    for left in range(count - 1):
        right = left + 1
        yield (X[left] + X[right]) / 2
示例#7
0
def log_logistic(X):
    """Compute log(1 / (1 + e ** -X)), the log of the sigmoid, elementwise.

    Evaluated in the numerically stable form used by sklearn:
    ``-log(1 + e ** -x)`` for ``x > 0`` and ``x - log(1 + e ** x)``
    otherwise. Both cases equal ``min(x, 0) - log1p(e ** -|x|)``, so the
    exponential never overflows.

    :param X: array-like of real values; it is flattened before use.
    :return: 1-D float array with the log-sigmoid of every value.
    """
    x = np.asarray(X, dtype=float).ravel()
    # exp(-|x|) lies in (0, 1], so neither exp nor log1p can overflow.
    return np.minimum(x, 0.0) - np.log1p(np.exp(-np.abs(x)))
示例#8
0
    def fit(self, X, y):
        """Run the outer SMO loop. Binary labels only for now: y is 1 or -1.

        Heuristic for picking i: when alpha[i] is neither C nor 0, X[i] is
        likely a support vector, so changing alpha[i] pays off more
        (alpha[i] is tightly linked to w; after updating the alpha of a
        non-support vector it may still be C or 0, in which case w does not
        change). Since support vectors satisfy uy = 1, no update is made
        when that already holds (within a tolerance ``tol``).

        view the ref: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-98-14.pdf(original paper)
        or the ref: http://www.cnblogs.com/jerrylead/archive/2011/03/18/1988419.html(in chinese)
        :param X: samples
        :param y: labels
        :return: None; the state is stored on ``self`` (alphas, w, b,
            errors, fai).
        """
        sample_count = _num_samples(X)
        self.alphas = np.zeros(sample_count)
        self.w = np.dot(X.T, self.alphas * y)
        self.b = 0
        self.errors = np.dot(X, self.w) - self.b - y
        self.fai = []
        C = self.C
        changed = 0
        sweep_everything = True
        passes = 0
        # Alternate between one sweep over every sample and repeated sweeps
        # over the samples whose alpha is strictly inside (0, C); stop after
        # 100 passes or when a full sweep changes nothing.
        while passes < 100 and (changed > 0 or sweep_everything):
            changed = 0
            if sweep_everything:
                for i in range(sample_count):
                    changed += self.examineExample(i, X, y)
                sweep_everything = False
            else:
                for i in range(sample_count):
                    # The bound test is re-evaluated per sample because
                    # earlier updates in this sweep may have moved alphas.
                    if self.alphas[i] == 0 or self.alphas[i] == C:
                        continue
                    changed += self.examineExample(i, X, y)
                if not changed:
                    sweep_everything = True
            passes += 1
    def fit(self, X, y):
        """Build the per-layer coefficient matrices and run back-propagation.

        For now this is written as if X held a single sample only.

        :param X: samples; validated together with ``y`` by ``Check_X_y``.
        :param y: labels; one-hot encoded before training.
        """
        X, y = Check_X_y(X, y)
        n_samples = _num_samples(X)  # currently unused
        y = self.one_hot_vector(y)
        self.coef_ = []
        n_inputs = X.shape[1]  # TODO(suyako): +1?
        n_outputs = y.shape[1]
        layer_units = [n_inputs]
        layer_units.extend(self.hidden_layer_sizes)
        layer_units.append(n_outputs)
        self.n_layers = len(layer_units)
        for layer in range(self.n_layers - 1):
            # One extra input row per layer (layer_units[layer] + 1).
            weights = self._init_coef(layer_units[layer] + 1,
                                      layer_units[layer + 1])
            # TODO(suyako): for now only built up to the coefficient matrices
            self.coef_.append(weights)
        # Back-propagation updates the weight matrices.
        self._backprop(X, y)
示例#10
0
    def split(self, X):
        """Yield the train/test index arrays of each fold in turn.

        The first ``n_samples % n_splits`` folds hold one sample more than
        the rest, so all samples are used exactly once as test data.

        :param X: samples; only their count is used.
        :return: generator of ``(train_indices, test_indices)`` arrays.
        :raises ValueError: if ``X`` is None (raised once iteration starts).
        """
        if X is None:
            raise ValueError("The 'X' parameter should not be none.")
        n_samples = _num_samples(X)
        order = np.arange(n_samples)
        if self.shuffle:
            np.random.shuffle(order)
        n_splits = self.n_splits
        base, extra = divmod(n_samples, n_splits)
        fold_sizes = np.ones(n_splits, dtype='int') * base
        fold_sizes[:extra] += 1
        begin = 0
        for size in fold_sizes:
            stop = begin + size
            train_idx = np.concatenate((order[:begin], order[stop:]))
            test_idx = order[begin:stop]
            yield train_idx, test_idx
            begin = stop
示例#11
0
    def examineExample(self, i, X, y):
        """Try to optimise alpha[i] together with a second multiplier.

        The first test is the KKT check. With r = errors[i] * y[i] and a
        tolerance ``tol``, sample i violates the conditions when
        1. r < -tol while alpha[i] < C (alpha could still grow), or
        2. r > tol while alpha[i] > 0 (alpha could still shrink).
        For 0 < alpha[i] < C both cases apply, so r must stay within
        [-tol, tol].

        On a violation the partner j is searched in three stages: the
        index picked by ``select`` (only when more than one non-bound
        alpha exists), then every non-bound index, then every index in
        random order. The first successful ``_updated`` call ends the
        search.

        :param i: index of the sample to examine
        :param X: samples
        :param y: labels
        :return: 1 if a pair of multipliers was updated, otherwise 0
        """
        alphas = self.alphas
        alpha_i = alphas[i]
        residual = self.errors[i] * y[i]
        # Fetched before the KKT test on purpose: the helper shuffles with
        # np.random, so its call position affects the random stream.
        free_idx = self._get_none_bound_alpha_index()
        breaks_kkt = ((residual < -self.tol and alpha_i < self.C)
                      or (residual > self.tol and alpha_i > 0))
        if not breaks_kkt:
            return 0

        if len(free_idx) > 1:
            j = select(i, self.errors, free_idx)
            if self._updated(i, j, X, y):
                return 1
        # any() stops at the first successful update, like an early return.
        if any(self._updated(i, j, X, y) for j in free_idx):
            return 1
        everyone = np.arange(_num_samples(alphas))
        np.random.shuffle(everyone)
        if any(self._updated(i, j, X, y) for j in everyone):
            return 1
        return 0
示例#12
0
 def _get_none_bound_alpha_index(self):
     """Return, in random order, the indices whose alpha is strictly in (0, C).

     :return: shuffled integer array of positions with 0 < alpha < C
     """
     alphas = self.alphas
     inside_box = np.logical_and(alphas > 0, alphas < self.C)
     candidates = np.arange(_num_samples(alphas))[inside_box]
     np.random.shuffle(candidates)
     return candidates
示例#13
0
def shuffle(X, y, max_iter=100):
    """Yield ``max_iter`` independently shuffled copies of ``(X, y)``.

    Each round draws a fresh random ordering and applies the same
    ordering to both X and y, so samples stay paired with their targets.

    :param X: samples; indexed with an integer index array.
    :param y: targets; indexed with the same index array.
    :param max_iter: number of shuffled pairs to generate.
    """
    total = _num_samples(X)
    for _ in range(max_iter):
        order = np.arange(total)
        np.random.shuffle(order)
        yield X[order], y[order]