def predict(self, x_mat: ndarray) -> Union[ndarray, int]:
    """
    Return the predicted label for each sample in x_mat.

    With the 'ovr' strategy the internal prediction is returned directly.
    With any other strategy each sample gets the label that occurs most
    often among the per-learner predictions; on a tie the largest label wins.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :return: predicted label(s)
    :raises StateError: if the learner has not been trained
    """
    if not self._trained:
        raise StateError('not trained yet')

    x_mat = _t.r2m(x_mat)
    pred = self.__predict(x_mat)
    if self.strategy == 'ovr':
        return _t.ret(pred)

    sample_count = x_mat.shape[0]
    result = np.empty((sample_count, ))
    for idx in range(sample_count):
        votes = pred[:, idx]
        # Sort (vote count, label) pairs; the last one is the winner.
        tally = [(np.sum(votes == label), label) for label in set(votes)]
        tally.sort()
        result[idx] = tally[-1][1]
    return _t.ret(result)
def cost(self, x_mat: ndarray, y_row: ndarray) -> float:
    """
    Compute the cost on x_mat and y_row.

    For two labels the internal cost function is applied once. Otherwise the
    cost is evaluated per label on the "is this label" targets and the
    results are combined, each weighted by the number of samples carrying
    that label divided by the total number of samples.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :param y_row: output row vector, one value per row of x_mat
    :return: cost value
    :raises StateError: if the learner has not been trained
    """
    if self._theta is None:
        raise StateError('not trained yet')

    self._theta, x_mat = _t.match_theta_x(self._theta, x_mat)
    x_mat, y_row = _t.match_x_y(x_mat, y_row)

    if len(self.labels) == 2:
        return self.__cost(self._theta, x_mat, y_row)

    sample_count = x_mat.shape[0]
    total = 0
    for col, label in enumerate(self.labels):
        is_label = y_row == label
        label_count = np.sum(is_label)
        label_cost = self.__cost(self._theta[:, col], x_mat, is_label)
        total = total + label_count * label_cost / sample_count
    return total
def extract7z(src_archive, extract_dir, progress=None, readExtensions=None,
              recursive=False, filelist_to_extract=None):
    """Run the 7z extraction command and report progress per extracted file.

    The command is built by ``_extract_command`` and its stdout is read line
    by line. Each line matched by ``regExtractMatch`` yields an extracted
    path; paths whose ``cext`` is in ``readExtensions`` are collected.

    :param src_archive: archive path object (``.tail.s`` is used for messages)
    :param extract_dir: passed through to ``_extract_command``
    :param progress: optional callable ``progress(index, message)``
    :param readExtensions: optional container of extensions to collect
    :param recursive: passed through to ``_extract_command``
    :param filelist_to_extract: passed through to ``_extract_command``
    :return: list of extracted paths whose extension is in readExtensions
    :raises StateError: if the process returns non-zero or an error line
        was seen in its output
    """
    command = _extract_command(src_archive, extract_dir, recursive,
                               filelist_to_extract)
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=1,
                            stdin=subprocess.PIPE, startupinfo=startupinfo)
    # Error checking, progress feedback and subArchives for recursive unpacking
    index, errorLine, subArchives = 0, u'', []
    source_archive = src_archive.tail.s
    with proc.stdout as out:
        for line in iter(out.readline, b''):
            line = unicode(line, 'utf8')
            if regErrMatch(line):
                # Decode the remaining output the same way as the first
                # line; joining raw byte lines into a unicode string relies
                # on an implicit ASCII decode and fails on non-ASCII names.
                errorLine = line + u''.join(
                    unicode(rest, 'utf8') for rest in out)
                break
            maExtracting = regExtractMatch(line)
            if maExtracting:
                extracted = GPath(maExtracting.group(1).strip())
                if readExtensions and extracted.cext in readExtensions:
                    subArchives.append(extracted)
                if not progress:
                    continue
                progress(index, source_archive + u'\n' + _(
                    u'Extracting files...') + u'\n' + extracted.s)
                index += 1
    returncode = proc.wait()
    if returncode or errorLine:
        raise StateError(
            u'%s: Extraction failed:\n7z.exe return value: %s\n%s' % (
                source_archive, str(returncode), errorLine))
    return subArchives
def compress7z(command, outDir, destArchive, srcDir, progress=None):
    """Run a 7z compression command and report progress per compressed file.

    The output file is ``outDir.join(destArchive)``. On failure its ``temp``
    file is removed and StateError is raised; on success ``untemp()`` is
    called on it.

    :param command: command handed to ``subprocess.Popen``
    :param outDir: directory path object the archive is created in
    :param destArchive: archive name path object (``.s`` is used in messages)
    :param srcDir: source directory; only walked to size the progress bar
    :param progress: optional progress object, callable as
        ``progress(index, message)`` and providing ``setFull``
    :raises StateError: if the process returns non-zero or an error line
        was seen in its output
    """
    outFile = outDir.join(destArchive)
    if progress is not None:
        #--Used solely for the progress bar
        length = sum(len(files) for x, y, files in walkdir(srcDir.s))
        progress(0, destArchive.s + u'\n' + _(u'Compressing files...'))
        progress.setFull(1 + length)
    #--Pack the files
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=1,
                            stdin=subprocess.PIPE,  # needed for some commands
                            startupinfo=startupinfo)
    #--Error checking and progress feedback
    index, errorLine = 0, u''
    with proc.stdout as out:
        for line in iter(out.readline, b''):
            line = unicode(line, 'utf8')  # utf-8 is ok see bosh.compressCommand
            if regErrMatch(line):
                # Decode the remaining output the same way as the first
                # line; joining raw byte lines into a unicode string relies
                # on an implicit ASCII decode and fails on non-ASCII names.
                errorLine = line + u''.join(
                    unicode(rest, 'utf8') for rest in out)
                break
            if progress is None:
                continue
            maCompressing = regCompressMatch(line)
            if maCompressing:
                progress(index, destArchive.s + u'\n' + _(
                    u'Compressing files...') + u'\n' + maCompressing.group(
                    1).strip())
                index += 1
    returncode = proc.wait()
    if returncode or errorLine:
        outFile.temp.remove()
        raise StateError(destArchive.s + u': Compression failed:\n' +
                         u'7z.exe return value: ' + str(returncode) + u'\n' +
                         errorLine)
    #--Finalize the file, and cleanup
    outFile.untemp()
def wrapPopenOut(command, wrapper, errorMsg):
    """Run command, hand its complete stdout to wrapper, and check the result.

    :param command: command handed to ``subprocess.Popen``
    :param wrapper: callable invoked with the captured stdout
    :param errorMsg: message prefix used when the process fails
    :raises StateError: if the process exits with a non-zero return code
    """
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=-1,
                            stdin=subprocess.PIPE, startupinfo=startupinfo)
    out, unused_err = proc.communicate()
    wrapper(out)
    returncode = proc.returncode
    if returncode:
        # Interpolate the code with %; concatenating the int onto the string
        # raised TypeError and hid the intended StateError.
        raise StateError(
            errorMsg + u'\nPopen return value: %d' % returncode)
def cost(self, x_mat: ndarray, y_row: ndarray) -> float:
    """
    Compute the cost on x_mat and y_row. Only the learners that took part in
    the classification result contribute.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :param y_row: output row vector, one value per row of x_mat
    :return: cost value
    :raises StateError: if the learner has not been trained
    """
    if not self._trained:
        raise StateError('not trained yet')

    x_mat, y_row = _t.match_x_y(x_mat, y_row, add_ones=False)
    pred = self.__predict(x_mat)
    sample_count = x_mat.shape[0]
    total = 0

    if self.strategy == 'ovr':
        for row, label in enumerate(pred):
            # Translate the true output into the target the learner selected
            # by the predicted label is evaluated against.
            if y_row[row] == label:
                target = (label if self.positive_label is None
                          else self.positive_label)
            else:
                target = (label + 1 if self.negative_label is None
                          else self.negative_label)
            total = total + self._learners[label].cost(
                x_mat[row, :], np.array([target]))
        return total / sample_count

    # Majority vote per sample over the per-learner predictions.
    winners = np.empty((sample_count, ))
    for row in range(sample_count):
        votes = pred[:, row]
        tally = [(np.sum(votes == label), label) for label in set(votes)]
        tally.sort()
        winners[row] = tally[-1][1]

    for row in range(sample_count):
        agreeing_cost = 0
        agreeing = 0
        for j, ((positive, negative), learner) in enumerate(
                self._learners.items()):
            # Skip learners whose prediction differs from the voted label.
            if pred[j][row] != winners[row]:
                continue
            if y_row[row] == positive:
                target = (positive if self.positive_label is None
                          else self.positive_label)
            else:
                target = (negative if self.negative_label is None
                          else self.negative_label)
            agreeing_cost = agreeing_cost + learner.cost(
                x_mat[row, :], np.array([target]))
            agreeing = agreeing + 1
        if agreeing != 0:
            total = total + agreeing_cost / agreeing

    return total / sample_count
def probability(self, x_mat: ndarray) -> Union[ndarray, float]:
    """
    Return the predicted probability for x_mat. For a binary problem this is
    a row vector; for a multi-class problem it is an m*num_labels matrix in
    which every row holds one sample's probability for each class.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :return: predicted probability
    :raises StateError: if the learner has not been trained
    """
    if self._theta is None:
        raise StateError('not trained yet')

    self._theta, x_mat = _t.match_theta_x(self._theta, x_mat)
    scores = x_mat @ self._theta
    return _mf.sigmoid(scores)
def cost(self, x_mat: ndarray, y_row: ndarray):
    """
    Compute the cost on x_mat and y_row.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :param y_row: output row vector, one value per row of x_mat
    :return: cost value
    :raises StateError: if the learner has not been trained
    """
    if self._theta_row is None:
        raise StateError('not trained yet')

    matched = _t.match_theta_x(self._theta_row, x_mat)
    self._theta_row, x_mat = matched
    x_mat, y_row = _t.match_x_y(x_mat, y_row)
    return self.__cost(self._theta_row, x_mat, y_row)
def __match_theta_x(self, x_mat: ndarray) -> ndarray:
    """
    Check that the input matches the trained parameters.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :return: x_mat after conversion by ``_t.r2m``
    :raises StateError: if the learner has not been trained
    :raises DataNotMatchError: if the column count of x_mat differs from the
        first dimension of the parameters
    """
    if self._theta is None:
        raise StateError('not trained yet')

    x_mat = _t.r2m(x_mat)
    feature_count = x_mat.shape[1]
    if feature_count != self._theta.shape[0]:
        raise DataNotMatchError('feature quantity mismatch')
    return x_mat
def predict(self, x_mat: ndarray) -> Union[ndarray, int]:
    """
    Predict on new data. Returns a single number when x_mat holds one sample,
    otherwise a row vector.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :return: predicted value or vector
    :raises StateError: if the learner has not been trained
    :raises DataNotMatchError: if the column count differs from len(self.pvs)
    """
    if self._root is None:
        raise StateError('not trained yet')

    x_mat = _t.r2m(x_mat)
    feature_count = x_mat.shape[1]
    if feature_count != len(self.pvs):
        raise DataNotMatchError('feature quantity mismatch')

    return self.__predict(self._root, x_mat)
def probability(self, x_mat: ndarray) -> Union[ndarray, float]:
    """
    Return the predicted probability for x_mat. For a binary problem this is
    a row vector; for a multi-class problem it is an m*num_labels matrix in
    which every row holds one sample's probability for each class.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :return: predicted probability
    :raises StateError: if the learner has not been trained
    :raises DataNotMatchError: if the feature count differs from the number
        of input-layer nodes
    """
    if self._thetas is None:
        raise StateError('not trained yet')

    x_mat = _t.addones(x_mat)
    # One column is subtracted because addones was applied to x_mat above.
    feature_count = x_mat.shape[1] - 1
    if feature_count != self.layer_nodes[0]:
        raise DataNotMatchError(
            'feature number and input layer node number mismatch')

    output = self.__feedforward(self._thetas, x_mat)[-1]
    return _t.ret(_t.c2r(output))
def cost(self, x_mat: ndarray, y_row: ndarray) -> float:
    """
    Compute the cost on x_mat and y_row.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :param y_row: output row vector, one value per row of x_mat
    :return: cost value
    :raises StateError: if the learner has not been trained
    :raises DataNotMatchError: if the feature count differs from the number
        of input-layer nodes
    """
    if self._thetas is None:
        raise StateError('not trained yet')

    x_mat, y_row = _t.match_x_y(x_mat, y_row)
    feature_count = x_mat.shape[1] - 1
    if feature_count != self.layer_nodes[0]:
        raise DataNotMatchError(
            'feature number and input layer node number mismatch')

    return self.__cost(self._thetas, x_mat, y_row)
def cost(self, x_mat: ndarray, y_row: ndarray) -> float:
    """
    Compute the cost on x_mat and y_row.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :param y_row: output row vector, one value per row of x_mat
    :return: cost value
    :raises StateError: if the learner has not been trained
    :raises DataNotMatchError: if the column count differs from len(self.pvs)
    """
    if self._root is None:
        raise StateError('not trained yet')

    x_mat, y_row = _t.match_x_y(x_mat, y_row, add_ones=False)
    feature_count = x_mat.shape[1]
    if feature_count != len(self.pvs):
        raise DataNotMatchError('feature quantity mismatch')

    # TODO: for now the error rate serves as the cost; think about a better
    # measure later.
    predicted = self.__predict(self._root, x_mat)
    return 1 - _ms.accuracy(predicted, y_row)
def predict(self, x_mat: ndarray):
    """
    Predict with the trained parameters. If the mean and standard-deviation
    vectors from feature scaling are available, the features are normalized
    first.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :return: prediction result
    :raises StateError: if the learner has not been trained
    """
    if self._theta_row is None:
        raise StateError('not trained yet')

    self._theta_row, x_mat = _t.match_theta_x(self._theta_row, x_mat)

    # The normal-equation method does not need normalized data.
    scaling_known = (self.method == 'gradient'
                     and self.mean_row is not None
                     and self.std_row is not None)
    if scaling_known:
        normalized = _dp.feature_normalize(
            x_mat, mean_row=self.mean_row, std_row=self.std_row)
        x_mat = normalized[0]

    return _t.ret(x_mat @ self._theta_row)
def predict(self, x_mat: ndarray) -> Union[ndarray, int]:
    """
    Return the predicted label for each sample in x_mat.

    With two labels the sigmoid of the linear score is compared against
    self.threshold; otherwise the label with the highest score is chosen.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :return: predicted label(s)
    :raises StateError: if the learner has not been trained
    """
    if self._theta is None:
        raise StateError('not trained yet')

    self._theta, x_mat = _t.match_theta_x(self._theta, x_mat)
    scores = x_mat @ self._theta

    if len(self.labels) != 2:
        best = np.argmax(scores, axis=1)
        return _t.ret(self.labels[best])

    is_positive = _mf.sigmoid(scores) >= self.threshold
    return _t.ret(_t.convert_y(self.labels, is_positive, to=False))
def __evaluation(self, x_mat: ndarray,
                 eval: Callable[[IProbabilityLearner], ndarray]):
    """
    Build an m*n matrix of per-label evaluations, where m is the number of
    samples in x_mat and n the number of labels.

    With the 'ovr' strategy column i is eval applied to the i-th learner.
    Otherwise column i is the mean of eval over the learners keyed
    (label i, other label) for every other label.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :param eval: callable mapping a learner to its evaluation values
    :return: evaluation matrix
    :raises StateError: if the learner has not been trained
    """
    if not self._trained:
        raise StateError('not trained yet')

    x_mat = _t.r2m(x_mat)
    sample_count = x_mat.shape[0]
    label_count = len(self._labels)
    evaluation = np.zeros((sample_count, label_count), dtype=float)

    if self.strategy == 'ovr':
        for col, learner in enumerate(self._learners.values()):
            evaluation[:, col] = eval(learner)
        return evaluation

    for col, positive in enumerate(self.labels):
        for negative in self.labels:
            if positive != negative:
                pair_learner = self._learners[(positive, negative)]
                evaluation[:, col] = evaluation[:, col] + eval(pair_learner)
        # Average over the opponents of this label.
        evaluation[:, col] = evaluation[:, col] / (label_count - 1)

    return evaluation
def restore(self, reduced_mat: ndarray) -> ndarray:
    """
    Restore data from its reduced-dimension form.

    :param reduced_mat: the matrix after dimensionality reduction
    :return: the restored matrix
    :raises StateError: if the learner has not been trained
    :raises ValueError: if reduced_mat is not two-dimensional, or has more
        columns than len(self.labels) - 1
    """
    if self._theta is None:
        raise StateError('not trained yet')

    shape = reduced_mat.shape
    if len(shape) != 2:
        raise ValueError('reduced_mat must be a matrix')

    dimension = shape[1]
    if dimension > len(self.labels) - 1:
        raise ValueError('reduced_mat is not a compressed matrix')

    if len(self.labels) == 2:
        # Reshape the parameters into a single column before inverting.
        projection = self._theta.reshape((self._theta.shape[0], 1))
    else:
        projection = self._theta[:, :dimension]
    return reduced_mat @ np.linalg.pinv(projection)
def predict(self, x_mat: ndarray):
    """
    Return the predicted label for each sample in x_mat.

    For the 'linear' kernel the decision value is ``x_mat @ theta + b``.
    For 'gauss' or a callable kernel it is the sum over the stored training
    samples of ``alpha_j * y_j * K(x, x_j)`` plus ``b``. Samples whose
    decision value is >= 0 are mapped to labels through ``_t.convert_y``.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :return: predicted label(s)
    :raises StateError: if the learner has not been trained
    :raises DataNotMatchError: if the column count of x_mat differs from the
        first dimension of the parameters
    """
    if self._theta is None:
        raise StateError('not trained yet')

    x_mat = _t.r2m(x_mat)
    if x_mat.shape[1] != self._theta.shape[0]:
        raise DataNotMatchError('feature quantity mismatch')

    if self.kernel == 'linear':
        pred = x_mat @ self._theta + self._b
    else:
        if self.kernel == 'gauss':
            def kernel(a, b):
                return gaussian_kernel(a, b, self.gamma)
        else:
            kernel = self.kernel

        m = x_mat.shape[0]
        # The sums below accumulate into pred, so it must start at zero;
        # np.empty would leave uninitialized memory as the starting value.
        pred = np.zeros((m, ))
        for i in range(m):
            for j in range(self._x_mat.shape[0]):
                pred[i] = pred[i] + self._alphas[j] * self._y_row[j] * kernel(
                    x_mat[i], self._x_mat[j])
            pred[i] = pred[i] + self._b

    return _t.ret(
        _t.convert_y(self.labels, (pred >= 0).astype(dtype=np.int16),
                     to=False))
def predict(self, x_mat: ndarray) -> Union[ndarray, int]:
    """
    Return the predicted label for each sample in x_mat.

    With two labels the output-layer activation is compared against
    self.threshold; otherwise the label with the highest activation is
    chosen.

    :param x_mat: feature matrix, m rows are samples and n columns are features
    :return: predicted label(s)
    :raises StateError: if the learner has not been trained
    :raises DataNotMatchError: if the feature count differs from the number
        of input-layer nodes
    """
    if self._thetas is None:
        raise StateError('not trained yet')

    x_mat = _t.addones(x_mat)
    feature_count = x_mat.shape[1] - 1
    if feature_count != self.layer_nodes[0]:
        raise DataNotMatchError(
            'feature number and input layer node number mismatch')

    # The last feed-forward result is the output-layer activation.
    output = self.__feedforward(self._thetas, x_mat)[-1]

    if len(self.labels) != 2:
        best = np.argmax(output, axis=1)
        return _t.ret(self.labels[best])

    is_positive = _t.c2r(output >= self.threshold)
    return _t.ret(_t.convert_y(self.labels, is_positive, to=False))