Пример #1
0
    def predict(self, x_mat: ndarray) -> Union[ndarray, int]:
        """
        Return the predicted labels corresponding to x_mat.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :return: predicted labels
        :raises StateError: if the model has not been trained yet
        """

        if not self._trained:
            raise StateError('not trained yet')

        x_mat = _t.r2m(x_mat)
        pred = self.__predict(x_mat)

        # One-vs-rest: the raw prediction only needs to go through _t.ret.
        if self.strategy == 'ovr':
            return _t.ret(pred)

        # Otherwise column i of pred holds one entry per learner for sample i.
        # Pick the most frequent label; on a tie the larger label wins because
        # the (count, label) tuples are sorted ascending and the last is taken.
        sample_count = x_mat.shape[0]
        winners = np.empty((sample_count, ))
        for idx in range(sample_count):
            votes = pred[:, idx]
            tallies = [(np.sum(votes == label), label) for label in set(votes)]
            tallies.sort()
            winners[idx] = tallies[-1][1]

        return _t.ret(winners)
Пример #2
0
    def cost(self, x_mat: ndarray, y_row: ndarray) -> float:
        """
        Compute the cost on x_mat and y_row.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :param y_row: output row vector; each value is the output for the
                      corresponding row of x_mat
        :return: the cost value
        :raises StateError: if the model has not been trained yet
        """

        if self._theta is None:
            raise StateError('not trained yet')

        self._theta, x_mat = _t.match_theta_x(self._theta, x_mat)
        x_mat, y_row = _t.match_x_y(x_mat, y_row)

        # Two labels: a single parameter set covers the whole problem.
        if len(self.labels) == 2:
            return self.__cost(self._theta, x_mat, y_row)

        # More labels: column k of theta belongs to label k. Each per-label
        # cost is weighted by the share of samples carrying that label.
        sample_count = x_mat.shape[0]
        total = 0
        for col, label in enumerate(self.labels):
            is_label = y_row == label
            weight = np.sum(is_label)
            label_cost = self.__cost(self._theta[:, col], x_mat, is_label)
            total = total + weight * label_cost / sample_count

        return total
Пример #3
0
def extract7z(src_archive, extract_dir, progress=None, readExtensions=None,
              recursive=False, filelist_to_extract=None):
    """Extract src_archive into extract_dir by running an external command.

    The command is built by _extract_command and its stdout is read line by
    line to detect errors and report progress.

    :param src_archive: path object of the archive to extract
    :param extract_dir: destination directory, forwarded to _extract_command
    :param progress: optional callable invoked as progress(index, message)
                     for every extracted file
    :param readExtensions: optional container of extensions; extracted files
                           whose cext is in it are collected and returned
    :param recursive: forwarded to _extract_command
    :param filelist_to_extract: forwarded to _extract_command
    :return: list of extracted paths whose extension is in readExtensions
    :raises StateError: if the process exits with a non-zero code or an
                        error line was found in its output
    """
    command = _extract_command(src_archive, extract_dir, recursive,
                               filelist_to_extract)
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=1,
                            stdin=subprocess.PIPE, startupinfo=startupinfo)
    #--Error checking, progress feedback and sub-archive collection
    source_archive = src_archive.tail.s
    subArchives = []
    errorLine = u''
    index = 0
    with proc.stdout as out:
        for raw_line in iter(out.readline, b''):
            line = unicode(raw_line, 'utf8')
            if regErrMatch(line):
                # Keep the rest of the output as part of the error text
                errorLine = line + u''.join(out)
                break
            maExtracting = regExtractMatch(line)
            if not maExtracting:
                continue
            extracted = GPath(maExtracting.group(1).strip())
            if readExtensions and extracted.cext in readExtensions:
                subArchives.append(extracted)
            if progress:
                message = source_archive + u'\n' + _(
                    u'Extracting files...') + u'\n' + extracted.s
                progress(index, message)
                index += 1
    returncode = proc.wait()
    if returncode or errorLine:
        raise StateError(u'%s: Extraction failed:\n7z.exe return value: %s\n%s'
                         % (source_archive, str(returncode), errorLine))
    return subArchives
Пример #4
0
def compress7z(command, outDir, destArchive, srcDir, progress=None):
    """Run a compression command and finalize the resulting archive.

    :param command: command passed to subprocess.Popen
    :param outDir: path object of the directory that receives the archive
    :param destArchive: path object naming the archive inside outDir
    :param srcDir: path object of the directory being packed; only walked to
                   size the progress bar
    :param progress: optional progress object; called as
                     progress(index, message) and must offer setFull()
    :raises StateError: if the process exits with a non-zero code or an
                        error line was found in its output; the temporary
                        output file is removed first
    """
    outFile = outDir.join(destArchive)
    if progress is not None:
        #--The file count is used solely for the progress bar
        length = sum(len(files) for _root, _dirs, files in walkdir(srcDir.s))
        progress(0, destArchive.s + u'\n' + _(u'Compressing files...'))
        progress.setFull(1 + length)
    #--Pack the files
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=1,
                            stdin=subprocess.PIPE, # needed for some commands
                            startupinfo=startupinfo)
    #--Error checking and progress feedback
    errorLine = u''
    index = 0
    with proc.stdout as out:
        for raw_line in iter(out.readline, b''):
            # utf-8 is ok see bosh.compressCommand
            line = unicode(raw_line, 'utf8')
            if regErrMatch(line):
                # Keep the rest of the output as part of the error text
                errorLine = line + u''.join(out)
                break
            if progress is not None:
                maCompressing = regCompressMatch(line)
                if maCompressing:
                    compressed_name = maCompressing.group(1).strip()
                    progress(index, destArchive.s + u'\n' + _(
                        u'Compressing files...') + u'\n' + compressed_name)
                    index += 1
    returncode = proc.wait()
    if returncode or errorLine:
        outFile.temp.remove()
        raise StateError(destArchive.s + u': Compression failed:\n' +
                u'7z.exe return value: ' + str(returncode) + u'\n' + errorLine)
    #--Finalize the file, and cleanup
    outFile.untemp()
Пример #5
0
def wrapPopenOut(command, wrapper, errorMsg):
    """Run command, hand its captured stdout to wrapper, raise on failure.

    :param command: command passed to subprocess.Popen
    :param wrapper: callable invoked with the complete stdout of the process
    :param errorMsg: text prefixed to the StateError message on failure
    :raises StateError: if the process exits with a non-zero return code
    """
    proc = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=-1,
                            stdin=subprocess.PIPE, startupinfo=startupinfo)
    out, unused_err = proc.communicate()
    wrapper(out)
    returncode = proc.returncode
    if returncode:
        # Interpolate the code with %; concatenating the int onto the string
        # raised TypeError and masked the real failure.
        raise StateError(errorMsg + u'\nPopen return value: %d' % returncode)
Пример #6
0
    def cost(self, x_mat: ndarray, y_row: ndarray) -> float:
        """
        Compute the cost on x_mat and y_row. Only the learners that took part
        in the classification result are counted.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :param y_row: output row vector; each value is the output for the
                      corresponding row of x_mat
        :return: the cost value
        :raises StateError: if the model has not been trained yet
        """

        if not self._trained:
            raise StateError('not trained yet')

        x_mat, y_row = _t.match_x_y(x_mat, y_row, add_ones=False)
        pred = self.__predict(x_mat)
        sample_count = x_mat.shape[0]
        total = 0

        if self.strategy == 'ovr':
            # The learner keyed by the predicted label scores each sample.
            for row, label in enumerate(pred):
                if y_row[row] == label:
                    target = label if self.positive_label is None else self.positive_label
                else:
                    target = label + 1 if self.negative_label is None else self.negative_label
                total = total + self._learners[label].cost(
                    x_mat[row, :], np.array([target]))
            return total / sample_count

        # Most frequent label per sample (column of pred); ties go to the
        # larger label because the sorted (count, label) list's last is taken.
        winners = np.empty((sample_count, ))
        for col in range(sample_count):
            votes = pred[:, col]
            tallies = [(np.sum(votes == label), label) for label in set(votes)]
            tallies.sort()
            winners[col] = tallies[-1][1]

        # Average, per sample, the cost of the learners that predicted the
        # winning label.
        for col in range(sample_count):
            agreeing_cost = 0
            agreeing = 0
            for k, ((positive, negative),
                    learner) in enumerate(self._learners.items()):
                if pred[k][col] != winners[col]:
                    continue
                if y_row[col] == positive:
                    target = positive if self.positive_label is None else self.positive_label
                else:
                    target = negative if self.negative_label is None else self.negative_label
                agreeing_cost = agreeing_cost + learner.cost(
                    x_mat[col, :], np.array([target]))
                agreeing = agreeing + 1
            if agreeing != 0:
                total = total + agreeing_cost / agreeing

        return total / sample_count
Пример #7
0
    def probability(self, x_mat: ndarray) -> Union[ndarray, float]:
        """
        Return the predicted probabilities for x_mat. For a binary problem a
        row vector is returned; for a multi-class problem an m*num_labels
        matrix is returned, each row holding one sample's probability for
        every class.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :return: predicted probabilities
        :raises StateError: if the model has not been trained yet
        """

        if self._theta is None:
            raise StateError('not trained yet')

        self._theta, x_mat = _t.match_theta_x(self._theta, x_mat)
        linear_output = x_mat @ self._theta

        return _mf.sigmoid(linear_output)
Пример #8
0
    def cost(self, x_mat: ndarray, y_row: ndarray):
        """
        Compute the cost on x_mat and y_row.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :param y_row: output row vector; each value is the output for the
                      corresponding row of x_mat
        :return: the cost value
        :raises StateError: if the model has not been trained yet
        """

        if self._theta_row is None:
            raise StateError('not trained yet')

        # Align the parameters with the features first, then the features
        # with the outputs.
        matched = _t.match_theta_x(self._theta_row, x_mat)
        self._theta_row, x_mat = matched
        x_mat, y_row = _t.match_x_y(x_mat, y_row)

        return self.__cost(self._theta_row, x_mat, y_row)
Пример #9
0
    def __match_theta_x(self, x_mat: ndarray) -> ndarray:
        """
        Check that the input matches the parameters.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :return: x_mat after conversion by _t.r2m
        :raises StateError: if the model has not been trained yet
        :raises DataNotMatchError: if the column count of x_mat differs from
                                   the first dimension of the parameters
        """

        if self._theta is None:
            raise StateError('not trained yet')

        x_mat = _t.r2m(x_mat)
        feature_count = x_mat.shape[1]
        if feature_count == self._theta.shape[0]:
            return x_mat

        raise DataNotMatchError('feature quantity mismatch')
Пример #10
0
    def predict(self, x_mat: ndarray) -> Union[ndarray, int]:
        """
        Predict for new data. A single number is returned when x_mat holds
        one sample, otherwise a row vector.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :return: predicted value or vector
        :raises StateError: if the model has not been trained yet
        :raises DataNotMatchError: if the column count of x_mat differs from
                                   len(self.pvs)
        """

        if self._root is None:
            raise StateError('not trained yet')

        x_mat = _t.r2m(x_mat)
        feature_count = x_mat.shape[1]
        if feature_count == len(self.pvs):
            return self.__predict(self._root, x_mat)

        raise DataNotMatchError('feature quantity mismatch')
Пример #11
0
    def probability(self, x_mat: ndarray) -> Union[ndarray, float]:
        """
        Return the predicted probabilities for x_mat. For a binary problem a
        row vector is returned; for a multi-class problem an m*num_labels
        matrix is returned, each row holding one sample's probability for
        every class.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :return: predicted probabilities
        :raises StateError: if the model has not been trained yet
        :raises DataNotMatchError: if the feature count differs from the
                                   input layer size
        """

        if self._thetas is None:
            raise StateError('not trained yet')

        x_mat = _t.addones(x_mat)
        # The column added by addones is excluded from the comparison.
        input_nodes = self.layer_nodes[0]
        if x_mat.shape[1] - 1 != input_nodes:
            raise DataNotMatchError(
                'feature number and input layer node number mismatch')

        # The last element of the feed-forward result is the output layer.
        layer_outputs = self.__feedforward(self._thetas, x_mat)
        return _t.ret(_t.c2r(layer_outputs[-1]))
Пример #12
0
    def cost(self, x_mat: ndarray, y_row: ndarray) -> float:
        """
        Compute the cost on x_mat and y_row.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :param y_row: output row vector; each value is the output for the
                      corresponding row of x_mat
        :return: the cost value
        :raises StateError: if the model has not been trained yet
        :raises DataNotMatchError: if the feature count differs from the
                                   input layer size
        """

        if self._thetas is None:
            raise StateError('not trained yet')

        x_mat, y_row = _t.match_x_y(x_mat, y_row)
        # One column is subtracted before comparing with the input layer size.
        input_nodes = self.layer_nodes[0]
        if x_mat.shape[1] - 1 == input_nodes:
            return self.__cost(self._thetas, x_mat, y_row)

        raise DataNotMatchError(
            'feature number and input layer node number mismatch')
Пример #13
0
    def cost(self, x_mat: ndarray, y_row: ndarray) -> float:
        """
        Compute the cost on x_mat and y_row.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :param y_row: output row vector; each value is the output for the
                      corresponding row of x_mat
        :return: the cost value
        :raises StateError: if the model has not been trained yet
        :raises DataNotMatchError: if the column count of x_mat differs from
                                   len(self.pvs)
        """

        if self._root is None:
            raise StateError('not trained yet')

        x_mat, y_row = _t.match_x_y(x_mat, y_row, add_ones=False)
        if x_mat.shape[1] != len(self.pvs):
            raise DataNotMatchError('feature quantity mismatch')

        # TODO: the error rate is used as the cost for now; find a better
        # measure later
        predicted = self.__predict(self._root, x_mat)
        return 1 - _ms.accuracy(predicted, y_row)
Пример #14
0
    def predict(self, x_mat: ndarray):
        """
        Predict using the trained parameters. If the mean and standard
        deviation vectors from feature scaling are available, the feature
        values are normalized first.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :return: prediction result
        :raises StateError: if the model has not been trained yet
        """

        if self._theta_row is None:
            raise StateError('not trained yet')

        self._theta_row, x_mat = _t.match_theta_x(self._theta_row, x_mat)
        # The normal-equation method does not need normalized data.
        if self.method == 'gradient':
            if self.mean_row is not None and self.std_row is not None:
                normalized = _dp.feature_normalize(x_mat,
                                                   mean_row=self.mean_row,
                                                   std_row=self.std_row)
                x_mat = normalized[0]

        return _t.ret(x_mat @ self._theta_row)
Пример #15
0
    def predict(self, x_mat: ndarray) -> Union[ndarray, int]:
        """
        Return the predicted labels corresponding to x_mat.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :return: predicted labels
        :raises StateError: if the model has not been trained yet
        """

        if self._theta is None:
            raise StateError('not trained yet')

        self._theta, x_mat = _t.match_theta_x(self._theta, x_mat)
        prob = x_mat @ self._theta

        if len(self.labels) != 2:
            # Multi-class: take the label whose column scores highest.
            best_columns = np.argmax(prob, axis=1)
            return _t.ret(self.labels[best_columns])

        # Binary: threshold the sigmoid output, then map back to the labels.
        is_positive = _mf.sigmoid(prob) >= self.threshold
        return _t.ret(_t.convert_y(self.labels, is_positive, to=False))
Пример #16
0
    def __evaluation(self, x_mat: ndarray,
                     eval: Callable[[IProbabilityLearner], ndarray]):
        """
        Build an m*n matrix of per-label evaluations, where m is the number
        of rows of x_mat and n is the number of labels.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :param eval: callable applied to each learner; its result fills or
                     is added to one column of the matrix
        :return: the evaluation matrix
        :raises StateError: if the model has not been trained yet
        """
        if not self._trained:
            raise StateError('not trained yet')

        x_mat = _t.r2m(x_mat)
        sample_count = x_mat.shape[0]
        label_count = len(self._labels)
        evaluation = np.zeros((sample_count, label_count), dtype=float)

        if self.strategy == 'ovr':
            # One learner per column, in the iteration order of _learners.
            for col, learner in enumerate(self._learners.values()):
                evaluation[:, col] = eval(learner)
            return evaluation

        # Otherwise average, for each label, the results of the learners
        # keyed by (label, other) over all other labels.
        for col, positive in enumerate(self.labels):
            for negative in self.labels:
                if positive != negative:
                    pair_learner = self._learners[(positive, negative)]
                    evaluation[:, col] = evaluation[:, col] + eval(
                        pair_learner)
            evaluation[:, col] = evaluation[:, col] / (label_count - 1)

        return evaluation
Пример #17
0
    def restore(self, reduced_mat: ndarray) -> ndarray:
        """
        Restore data from its dimension-reduced form.

        :param reduced_mat: the dimension-reduced matrix
        :return: the restored matrix
        :raises StateError: if the model has not been trained yet
        :raises ValueError: if reduced_mat is not two-dimensional or has more
                            columns than len(self.labels) - 1
        """

        if self._theta is None:
            raise StateError('not trained yet')

        if len(reduced_mat.shape) != 2:
            raise ValueError('reduced_mat must be a matrix')

        label_count = len(self.labels)
        dimension = reduced_mat.shape[1]
        if dimension > label_count - 1:
            raise ValueError('reduced_mat is not a compressed matrix')

        if label_count == 2:
            # With two labels theta is reshaped into a single column.
            projection = self._theta.reshape((self._theta.shape[0], 1))
        else:
            # Use only as many columns of theta as reduced_mat has.
            projection = self._theta[:, :dimension]

        return reduced_mat @ np.linalg.pinv(projection)
Пример #18
0
    def predict(self, x_mat: ndarray):
        """
        Return the predicted labels corresponding to x_mat.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :return: predicted labels
        :raises StateError: if the model has not been trained yet
        :raises DataNotMatchError: if the column count of x_mat differs from
                                   the first dimension of the parameters
        """

        if self._theta is None:
            raise StateError('not trained yet')
        x_mat = _t.r2m(x_mat)
        if x_mat.shape[1] != self._theta.shape[0]:
            raise DataNotMatchError('feature quantity mismatch')

        if self.kernel == 'linear':
            pred = x_mat @ self._theta + self._b
        else:
            # 'gauss' selects the built-in Gaussian kernel with self.gamma;
            # any other value is used directly as the kernel callable.
            if self.kernel == 'gauss':
                def kernel(a, b):
                    return gaussian_kernel(a, b, self.gamma)
            else:
                kernel = self.kernel

            m = x_mat.shape[0]
            # The loop accumulates into pred, so it must start at zero;
            # np.empty would leave arbitrary values in the sums.
            pred = np.zeros((m, ))
            for i in range(m):
                for j in range(self._x_mat.shape[0]):
                    pred[i] = pred[i] + self._alphas[j] * self._y_row[
                        j] * kernel(x_mat[i], self._x_mat[j])
                pred[i] = pred[i] + self._b

        # A non-negative decision value is encoded as 1 and a negative one
        # as 0 before convert_y maps the result onto self.labels.
        return _t.ret(
            _t.convert_y(self.labels, (pred >= 0).astype(dtype=np.int16),
                         to=False))
Пример #19
0
    def predict(self, x_mat: ndarray) -> Union[ndarray, int]:
        """
        Return the predicted labels corresponding to x_mat.

        :param x_mat: feature matrix; the m rows are samples and the n
                      columns are features
        :return: predicted labels
        :raises StateError: if the model has not been trained yet
        :raises DataNotMatchError: if the feature count differs from the
                                   input layer size
        """

        if self._thetas is None:
            raise StateError('not trained yet')

        x_mat = _t.addones(x_mat)
        # The column added by addones is excluded from the comparison.
        input_nodes = self.layer_nodes[0]
        if x_mat.shape[1] - 1 != input_nodes:
            raise DataNotMatchError(
                'feature number and input layer node number mismatch')

        # The last element of the feed-forward result is the output layer.
        output = self.__feedforward(self._thetas, x_mat)[-1]

        if len(self.labels) != 2:
            # Multi-class: take the label whose output column is largest.
            best_columns = np.argmax(output, axis=1)
            return _t.ret(self.labels[best_columns])

        # Binary: threshold the output, then map back to the labels.
        is_positive = _t.c2r(output >= self.threshold)
        return _t.ret(_t.convert_y(self.labels, is_positive, to=False))