def predict(self, X):
    if not hasattr(self, "estimators_"):
        raise TypeError("fit must be called before predict.")
    res = np.zeros((_num_samples(X), 1))
    y = np.zeros((_num_samples(X), self.n_estimators))
    # Collect each base estimator's predictions as one column of y.
    for i, estimator in enumerate(self.estimators_):
        if not hasattr(estimator, "predict"):
            raise TypeError("The base estimator should implement predict.")
        y[:, i] = estimator.predict(X)
    # Majority vote per sample; np.bincount needs non-negative int labels.
    for i, line in enumerate(y):
        res[i] = np.argmax(np.bincount(line.astype(int)))
    return res
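# Worked example of the vote in predict above: for one sample, estimator
# votes [1, 0, 1] give np.bincount([1, 0, 1]) -> [1, 2], and np.argmax
# picks the majority class 1. Because np.bincount only accepts
# non-negative integers, class labels here are assumed to be encoded as
# 0..n_classes - 1.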
def split(self, X):
    """Leave-one-out split: each sample is the test set exactly once."""
    if X is None:
        raise ValueError("The 'X' parameter should not be None.")
    n_samples = _num_samples(X)
    indices = np.arange(n_samples)
    for i in range(n_samples):
        yield np.concatenate((indices[:i], indices[i + 1:])), indices[i]
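# Usage sketch for the leave-one-out split above (the class name below
# is hypothetical, assuming split() is a method of such a class):
#
#     loo = LeaveOneOut()
#     for train_idx, test_idx in loo.split(np.zeros((3, 2))):
#         print(train_idx, test_idx)
#
# With 3 samples this prints:
#     [1 2] 0
#     [0 2] 1
#     [0 1] 2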
def split(self, X):
    """Bootstrap split: draw n_samples indices with replacement as the
    training set; the never-drawn (out-of-bag) indices form the test set.
    """
    if X is None:
        raise ValueError("The 'X' parameter should not be None.")
    n_samples = _num_samples(X)
    indices = [np.random.randint(n_samples) for _ in range(n_samples)]
    return indices, list(set(np.arange(n_samples)) - set(indices))
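# Each sample avoids all n draws above with probability (1 - 1/n) ** n,
# which tends to 1/e, so roughly 36.8% of samples land in the
# out-of-bag test set. A standalone check (hedged sketch, not part of
# the class):
#
#     n = 100000
#     drawn = {np.random.randint(n) for _ in range(n)}
#     print(1 - len(drawn) / n)  # ~ 0.368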
def train_test_split(X, y, test_size=0.3, shuffle=False):
    """Hold-out split.

    Mirrors the call signature of sklearn's function, without the full
    complexity of the original.

    :param X: feature array
    :param y: target array
    :param test_size: fraction of samples assigned to the test set
    :param shuffle: whether to shuffle before splitting
    :return: X_train, X_test, y_train, y_test
    """
    if not _num_samples(X) == _num_samples(y):
        raise ValueError(
            "'X' and 'y' must have the same number of samples.")
    indices = np.arange(_num_samples(y))
    if shuffle:
        np.random.shuffle(indices)
    train_num = np.floor((1 - test_size) * len(y)).astype(int)
    X_train, X_test = X[indices[:train_num]], X[indices[train_num:]]
    y_train, y_test = y[indices[:train_num]], y[indices[train_num:]]
    return X_train, X_test, y_train, y_test
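# Usage example for train_test_split above: with 10 samples and
# test_size=0.3, floor(0.7 * 10) = 7 samples go to the training set.
#
#     X = np.arange(20).reshape(10, 2)
#     y = np.arange(10)
#     X_train, X_test, y_train, y_test = train_test_split(
#         X, y, test_size=0.3, shuffle=True)
#     # X_train has 7 rows, X_test has 3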
def one_hot_vector(y):
    y = check_array(y)
    n_samples = _num_samples(y)
    labels = np.unique(y)
    label_dict = {label: i for i, label in enumerate(labels)}
    res = np.zeros((n_samples, len(labels)))
    for i, row in enumerate(res):
        row[label_dict[y[i]]] = 1
    return res
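# Example for one_hot_vector above, assuming check_array accepts 1-d
# input: labels are mapped to columns in sorted order, so
#
#     one_hot_vector(np.array([2, 0, 2]))
#
# maps 0 -> column 0 and 2 -> column 1, returning
#     [[0., 1.],
#      [1., 0.],
#      [0., 1.]]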
def _median(X):
    """Yield the midpoint of each pair of neighboring values in X.

    X must be sorted before calling.

    :param X: array-like, sorted values
    """
    n_samples = _num_samples(X)
    if n_samples == 1:
        yield X[0]
    for i in range(n_samples - 1):
        yield (X[i] + X[i + 1]) / 2
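# Example for _median above: the sorted input [1, 2, 4] yields the two
# midpoints 1.5 and 3.0:
#
#     list(_median(np.array([1, 2, 4])))  # [1.5, 3.0]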
def log_logistic(X):
    """Compute log(1 / (1 + e ** -X)), the log of the sigmoid function.

    Following sklearn, for X_i <= 0 the expression is rewritten as
    X_i - log(1 + e ** X_i) so that exp never overflows.

    :param X: array-like of shape (n_samples,)
    :return: ndarray of log-sigmoid values
    """
    n_samples = _num_samples(X)
    out = np.zeros(n_samples)
    for i, x in enumerate(X):
        if x > 0:
            out[i] = -np.log(1 + np.exp(-x))
        else:
            # Equivalent form for x <= 0:
            # log(1 / (1 + e**-x)) = x - log(1 + e**x).
            out[i] = x - np.log(1 + np.exp(x))
    return out
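# Numeric check for log_logistic above: both branches stay finite even
# for large |x|, where evaluating np.log(1 / (1 + np.exp(-x))) directly
# would overflow or return -inf:
#
#     log_logistic(np.array([-1000., 0., 1000.]))
#     # -> [-1000., -0.69314718, -0.]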
def fit(self, X, y):
    """Binary classification only for now; y must be 1 or -1.

    Heuristic choice of i: when alpha[i] is neither 0 nor C, X[i] is
    more likely a support vector, so updating alpha[i] pays off more
    (alpha[i] is tightly coupled to w; for a non-support vector the
    updated alpha may well stay at 0 or C, in which case w does not
    change). Also, since support vectors satisfy u * y = 1, samples
    already satisfying u * y = 1 (within a tolerance tol) are not
    updated.

    See the original paper:
    https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/tr-98-14.pdf(original paper)
    or (in Chinese):
    http://www.cnblogs.com/jerrylead/archive/2011/03/18/1988419.html(in chinese)

    :param X: training samples
    :param y: labels, each 1 or -1
    :return: self
    """
    n_samples = _num_samples(X)
    self.alphas = np.zeros(n_samples)
    self.w = np.dot(X.T, self.alphas * y)
    self.b = 0
    # errors[i] = u_i - y_i, where u_i = w . x_i - b is the current output.
    self.errors = np.dot(X, self.w) - self.b - y
    C = self.C
    num_changed = 0
    examine_all = True
    iter_num = 0
    # Alternate between full passes and passes over the non-bound
    # alphas until a full pass changes nothing.
    while iter_num < 100 and (num_changed > 0 or examine_all):
        if examine_all:
            num_changed = sum(
                self.examineExample(i, X, y) for i in range(n_samples))
        else:
            num_changed = sum(
                self.examineExample(i, X, y) for i in range(n_samples)
                if self.alphas[i] != 0 and self.alphas[i] != C)
        if examine_all:
            examine_all = False
        elif not num_changed:
            examine_all = True
        iter_num += 1
    return self
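# Usage sketch for the SMO fit above (the class name and constructor
# are hypothetical; only C, tol, w and b are taken from the code):
#
#     clf = SimpleSMO(C=1.0, tol=1e-3)
#     clf.fit(X_train, y_train)               # y_train in {1, -1}
#     y_pred = np.sign(np.dot(X_test, clf.w) - clf.b)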
def fit(self, X, y):
    """For now this is written as if X held a single sample."""
    X, y = check_X_y(X, y)
    n_samples = _num_samples(X)
    y = self.one_hot_vector(y)
    self.coef_ = []
    input_layer_size = X.shape[1]  # TODO(suyako): +1?
    output_layer_size = y.shape[1]
    hidden_layer_sizes = list(self.hidden_layer_sizes)
    layer_units = ([input_layer_size] + hidden_layer_sizes
                   + [output_layer_size])
    self.n_layers = len(layer_units)
    for i in range(self.n_layers - 1):
        # +1 adds the bias unit of layer i.
        coef_ = self._init_coef(layer_units[i] + 1, layer_units[i + 1])
        self.coef_.append(coef_)
    # TODO(suyako): only the coefficient matrices are built so far.
    self._backprop(X, y)  # backpropagation updates the weight matrices
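# Shape check for the construction above, assuming _init_coef(a, b)
# returns an a x b matrix: with 4 input features, hidden_layer_sizes=(5,)
# and 3 classes, layer_units == [4, 5, 3], so the two weight matrices
# are (4 + 1) x 5 and (5 + 1) x 3, the +1 rows carrying the bias units.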
def split(self, X):
    """Yield the train/test indices for each fold.

    :param X: samples
    :return: generator of (train indices, test indices)
    """
    if X is None:
        raise ValueError("The 'X' parameter should not be None.")
    n_samples = _num_samples(X)
    indices = np.arange(n_samples)
    if self.shuffle:
        np.random.shuffle(indices)
    n_splits = self.n_splits
    # Each fold gets n_samples // n_splits items; the first
    # n_samples % n_splits folds get one extra.
    fold_sizes = n_samples // n_splits * np.ones(n_splits, dtype='int')
    fold_sizes[:n_samples % n_splits] += 1
    start = 0
    for fold_size in fold_sizes:
        end = start + fold_size
        yield np.concatenate(
            (indices[:start], indices[end:])), indices[start:end]
        start = end
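# Worked example of the fold sizing above: 10 samples and n_splits=3
# give a base size of 10 // 3 = 3 with remainder 10 % 3 = 1, so the
# fold sizes are [4, 3, 3] and the test slices are indices[0:4],
# indices[4:7] and indices[7:10].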
def examineExample(self, i, X, y):
    """Try to make one SMO update starting from sample i.

    The first if statement checks the KKT conditions. Note first that
    alpha always lies in [0, C] (alphas are initialized to 0). Then:

    1. alpha = 0: the sample lies outside the margin, so KKT requires
       u * y > 1, i.e. u * y - 1 > 0.
    2. alpha = C: the sample lies inside the margin, so KKT requires
       u * y < 1, i.e. u * y - 1 < 0.
    3. 0 < alpha < C: the sample is a support vector, so KKT requires
       u * y = 1, i.e. u * y - 1 = 0.

    In practice each condition is checked with a tolerance tol.

    :param i: index of the first alpha
    :param X: training samples
    :param y: labels, each 1 or -1
    :return: 1 if some pair of alphas was updated, else 0
    """
    y_i = y[i]
    alphas = self.alphas
    alpha1 = alphas[i]
    error_i = self.errors[i]
    # r_i = (u_i - y_i) * y_i = u_i * y_i - 1, the KKT residual.
    r_i = error_i * y_i
    not_bound_alpha_index = self._get_none_bound_alpha_index()
    if (r_i < -self.tol and alpha1 < self.C) or (r_i > self.tol and alpha1 > 0):
        # Second-choice heuristics: first the non-bound alpha picked by
        # select(), then the remaining non-bound alphas, then every
        # alpha in random order.
        if len(not_bound_alpha_index) > 1:
            j = select(i, self.errors, not_bound_alpha_index)
            if self._updated(i, j, X, y):
                return 1
        for j in not_bound_alpha_index:
            if self._updated(i, j, X, y):
                return 1
        all_index = np.arange(_num_samples(alphas))
        np.random.shuffle(all_index)
        for j in all_index:
            if self._updated(i, j, X, y):
                return 1
    return 0
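# Worked example of the KKT screen above: since y_i ** 2 == 1,
# r_i = (u_i - y_i) * y_i = u_i * y_i - 1. With tol=1e-3, a sample with
# r_i = 0.2 (margin comfortably satisfied) only violates KKT when
# alpha1 > 0, while one with r_i = -0.2 (inside the margin) only
# violates KKT when alpha1 < C; these are exactly the two disjuncts of
# the if test.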
def _get_none_bound_alpha_index(self):
    flags = (self.alphas > 0) & (self.alphas < self.C)
    index = np.arange(_num_samples(self.alphas))
    index = index[flags]
    np.random.shuffle(index)
    return index
def shuffle(X, y, max_iter=100):
    """Yield max_iter independently shuffled copies of (X, y), with the
    rows of X and y kept aligned."""
    n_samples = _num_samples(X)
    for _ in range(max_iter):
        index = np.arange(n_samples)
        np.random.shuffle(index)
        yield X[index], y[index]
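# Usage sketch for shuffle above, e.g. one freshly permuted copy per
# training epoch:
#
#     for X_s, y_s in shuffle(X, y, max_iter=5):
#         ...  # one epoch over the permuted data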