Exemplo n.º 1
0
    def train(self, trainSet: InstanceList, parameters: Parameter):
        """
        Fits a linear discriminant analysis classifier on the training set and stores the resulting LdaModel in
        self.model (Introduction to Machine Learning, Alpaydin, 2015).

        One covariance matrix is shared by all classes: each class covariance is scaled by (class size - 1), the
        scaled matrices are summed, and the sum is divided by (number of instances - number of classes). After the
        shared matrix is inverted, the discriminant of class Ci with mean vector m_i is computed as
        w_i = inverse covariance times m_i and w_i0 = -0.5 * (w_i . m_i) + log(P(Ci)).

        PARAMETERS
        ----------
        trainSet : InstanceList
            Training data given to the algorithm. The attribute count is read from its first instance.
        parameters : Parameter
            Parameter of the Lda algorithm. It is not read by this method.
        """
        priors = trainSet.classDistribution()
        partition = Partition(trainSet)
        dimension = trainSet.get(0).continuousAttributeSize()
        pooled = Matrix(dimension, dimension)
        classCount = partition.size()
        # Accumulate the class covariances, each weighted by its degrees of freedom.
        for index in range(classCount):
            group = partition.get(index)
            mean = Vector(group.continuousAverage())
            scatter = group.covariance(mean)
            scatter.multiplyWithConstant(group.size() - 1)
            pooled.add(scatter)
        pooled.divideByConstant(trainSet.size() - classCount)
        # The return value is not used: the pooled matrix itself is expected to hold the inverse afterwards.
        pooled.inverse()
        weights = {}
        biases = {}
        # Derive the weight vector and the bias term of every class from the inverted pooled covariance.
        for index in range(classCount):
            group = partition.get(index)
            label = group.getClassLabel()
            mean = Vector(group.continuousAverage())
            weight = pooled.multiplyWithVectorFromRight(mean)
            weights[label] = weight
            biases[label] = -0.5 * weight.dotProduct(mean) + math.log(priors.getProbability(label))
        self.model = LdaModel(priors, weights, biases)
Exemplo n.º 2
0
    def __linearRegressionOnCountsOfCounts(self, countsOfCounts: list) -> list:
        """
        Estimates smoothed counts of counts for Good-Turing smoothing by fitting a line in log-log space.

        The non-zero entries of countsOfCounts (index 0 is skipped) give the frequencies r and their counts c. For
        every such entry a value Z is computed: c for the first entry, c / (r_n - r_{n-1}) for the last entry and
        2c / (r_{n+1} - r_{n-1}) for the entries in between. A simple linear regression of log(Z) on log(r) is solved
        through its normal equations, giving w_1 and w_0 with log(N[i]) = w_1 log(i) + w_0.

        PARAMETERS
        ----------
        countsOfCounts : list
            Counts of counts. countsOfCounts[i] is the number of words occurred i times in the corpus.

        RETURNS
        -------
        list
            Estimated counts of counts array with the same length as countsOfCounts. N[i] = exp(w_1 log(i) + w_0)
            for i >= 1; N[0] is left as 0.0.
        """
        total = len(countsOfCounts)
        # Frequencies that were actually observed, together with how many words have that frequency.
        r = [freq for freq in range(1, total) if countsOfCounts[freq] != 0]
        c = [countsOfCounts[freq] for freq in r]
        A = Matrix(2, 2)
        y = Vector(2, 0)
        lastIndex = len(r) - 1
        for index, freq in enumerate(r):
            logFreq = math.log(freq)
            if index == 0:
                logZ = math.log(c[index])
            elif index == lastIndex:
                # No right neighbour: only the gap to the previous observed frequency is used.
                logZ = math.log((1.0 * c[index]) / (freq - r[index - 1]))
            else:
                logZ = math.log((2.0 * c[index]) / (r[index + 1] - r[index - 1]))
            # Accumulate the normal equations: A = [[n, sum x], [sum x, sum x^2]], y = [sum z, sum z * x].
            A.addValue(0, 0, 1.0)
            A.addValue(0, 1, logFreq)
            A.addValue(1, 0, logFreq)
            A.addValue(1, 1, logFreq * logFreq)
            y.addValue(0, logZ)
            y.addValue(1, logZ * logFreq)
        # The return value is not used: A itself is expected to hold the inverse afterwards.
        A.inverse()
        coefficients = A.multiplyWithVectorFromRight(y)
        w0 = coefficients.getValue(0)
        w1 = coefficients.getValue(1)
        N = [0.0] * total
        N[1:] = [math.exp(math.log(freq) * w1 + w0) for freq in range(1, total)]
        return N
Exemplo n.º 3
0
class MatrixTest(unittest.TestCase):
    """
    Unit tests for the Matrix class.

    setUp rebuilds every fixture before each test method, so the tests mutate the fixtures freely and do not undo
    their changes afterwards.
    """

    @staticmethod
    def _constantMatrix(size, value=1.0):
        """Builds a size x size Matrix and sets every entry to value through setValue."""
        matrix = Matrix(size, size)
        for i in range(size):
            for j in range(size):
                matrix.setValue(i, j, value)
        return matrix

    def setUp(self):
        """Creates the all-ones matrices and vectors, the random matrix and the identity matrix used by the tests."""
        self.small = self._constantMatrix(3)
        self.v = Vector(3, 1.0)
        self.large = self._constantMatrix(1000)
        self.medium = self._constantMatrix(100)
        self.V = Vector(1000, 1.0)
        self.vr = Vector(100, 1.0)
        # NOTE(review): presumably a 100 x 100 matrix of random values between 1 and 10 generated with seed 1;
        # confirm against the Matrix constructor.
        self.random = Matrix(100, 100, 1, 10, 1)
        self.originalSum = self.random.sumOfElements()
        # Single-argument constructor: used as the 100 x 100 identity matrix (sum 100, trace 100, determinant 1).
        self.identity = Matrix(100)

    def test_ColumnWiseNormalize(self):
        # After normalization every column sums to 1, so the total equals the number of columns.
        mClone = self.small.clone()
        mClone.columnWiseNormalize()
        self.assertEqual(3, mClone.sumOfElements())
        MClone = self.large.clone()
        MClone.columnWiseNormalize()
        self.assertAlmostEqual(1000, MClone.sumOfElements(), 3)
        self.identity.columnWiseNormalize()
        self.assertEqual(100, self.identity.sumOfElements())

    def test_MultiplyWithConstant(self):
        self.small.multiplyWithConstant(4)
        self.assertEqual(36, self.small.sumOfElements())
        self.large.multiplyWithConstant(1.001)
        self.assertAlmostEqual(1001000, self.large.sumOfElements(), 3)
        self.random.multiplyWithConstant(3.6)
        self.assertAlmostEqual(self.originalSum * 3.6,
                               self.random.sumOfElements(), 4)

    def test_DivideByConstant(self):
        self.small.divideByConstant(4)
        self.assertEqual(2.25, self.small.sumOfElements())
        self.large.divideByConstant(10)
        self.assertAlmostEqual(100000, self.large.sumOfElements(), 3)
        self.random.divideByConstant(3.6)
        self.assertAlmostEqual(self.originalSum / 3.6,
                               self.random.sumOfElements(), 4)

    def test_Add(self):
        # Adding the identity raises the total by its 100 diagonal ones.
        self.random.add(self.identity)
        self.assertAlmostEqual(self.originalSum + 100,
                               self.random.sumOfElements(), 4)

    def test_AddVector(self):
        # Adding a vector of 1000 ones to one row raises the total by 1000.
        self.large.addRowVector(4, self.V)
        # The third positional argument of assertEqual is the failure message, not a tolerance, so none is passed;
        # the sum consists of whole numbers and is compared exactly.
        self.assertEqual(1001000, self.large.sumOfElements())

    def test_Subtract(self):
        self.random.subtract(self.identity)
        self.assertAlmostEqual(self.originalSum - 100,
                               self.random.sumOfElements(), 4)

    def test_MultiplyWithVectorFromLeft(self):
        # With an all-ones vector the entries of the product add up to the sum of the matrix.
        result = self.small.multiplyWithVectorFromLeft(self.v)
        self.assertEqual(9, result.sumOfElements())
        result = self.large.multiplyWithVectorFromLeft(self.V)
        self.assertEqual(1000000, result.sumOfElements())
        result = self.random.multiplyWithVectorFromLeft(self.vr)
        self.assertAlmostEqual(self.originalSum, result.sumOfElements(), 4)

    def test_MultiplyWithVectorFromRight(self):
        # With an all-ones vector the entries of the product add up to the sum of the matrix.
        result = self.small.multiplyWithVectorFromRight(self.v)
        self.assertEqual(9, result.sumOfElements())
        result = self.large.multiplyWithVectorFromRight(self.V)
        self.assertEqual(1000000, result.sumOfElements())
        result = self.random.multiplyWithVectorFromRight(self.vr)
        self.assertAlmostEqual(self.originalSum, result.sumOfElements(), 4)

    def test_ColumnSum(self):
        # Every column of these fixtures has the same sum, so a random column index is sufficient.
        self.assertEqual(3, self.small.columnSum(randrange(3)))
        self.assertEqual(1000, self.large.columnSum(randrange(1000)))
        self.assertEqual(1, self.identity.columnSum(randrange(100)))

    def test_SumOfRows(self):
        self.assertEqual(9, self.small.sumOfRows().sumOfElements())
        self.assertEqual(1000000, self.large.sumOfRows().sumOfElements())
        self.assertEqual(100, self.identity.sumOfRows().sumOfElements())
        self.assertAlmostEqual(self.originalSum,
                               self.random.sumOfRows().sumOfElements(), 3)

    def test_RowSum(self):
        # Every row of these fixtures has the same sum, so a random row index is sufficient.
        self.assertEqual(3, self.small.rowSum(randrange(3)))
        self.assertEqual(1000, self.large.rowSum(randrange(1000)))
        self.assertEqual(1, self.identity.rowSum(randrange(100)))

    def test_Multiply(self):
        result = self.small.multiply(self.small)
        self.assertEqual(27, result.sumOfElements())
        result = self.medium.multiply(self.medium)
        self.assertEqual(1000000.0, result.sumOfElements())
        # Multiplying by the identity from either side must leave the total unchanged.
        result = self.random.multiply(self.identity)
        self.assertEqual(self.originalSum, result.sumOfElements())
        result = self.identity.multiply(self.random)
        self.assertEqual(self.originalSum, result.sumOfElements())

    def test_ElementProduct(self):
        result = self.small.elementProduct(self.small)
        self.assertEqual(9, result.sumOfElements())
        result = self.large.elementProduct(self.large)
        self.assertEqual(1000000, result.sumOfElements())
        # The element-wise product with the identity keeps only the diagonal, so sum and trace coincide.
        result = self.random.elementProduct(self.identity)
        self.assertEqual(result.trace(), result.sumOfElements())

    def test_SumOfElements(self):
        self.assertEqual(9, self.small.sumOfElements())
        self.assertEqual(1000000, self.large.sumOfElements())
        self.assertEqual(100, self.identity.sumOfElements())
        self.assertEqual(self.originalSum, self.random.sumOfElements())

    def test_Trace(self):
        self.assertEqual(3, self.small.trace())
        self.assertEqual(1000, self.large.trace())
        self.assertEqual(100, self.identity.trace())

    def test_Transpose(self):
        # Transposition only rearranges the entries, so the total is unchanged.
        self.assertEqual(9, self.small.transpose().sumOfElements())
        self.assertEqual(1000000, self.large.transpose().sumOfElements())
        self.assertEqual(100, self.identity.transpose().sumOfElements())
        self.assertAlmostEqual(self.originalSum,
                               self.random.transpose().sumOfElements(), 3)

    def test_IsSymmetric(self):
        self.assertTrue(self.small.isSymmetric())
        self.assertTrue(self.large.isSymmetric())
        self.assertTrue(self.identity.isSymmetric())
        self.assertFalse(self.random.isSymmetric())

    def test_Determinant(self):
        # The all-ones matrices have identical rows, hence determinant 0.
        self.assertEqual(0, self.small.determinant())
        self.assertEqual(0, self.large.determinant())
        self.assertEqual(1, self.identity.determinant())

    def test_Inverse(self):
        self.identity.inverse()
        self.assertEqual(100, self.identity.sumOfElements())
        # Inverting twice should give back the original matrix up to rounding.
        self.random.inverse()
        self.random.inverse()
        self.assertAlmostEqual(self.originalSum, self.random.sumOfElements(),
                               5)

    def test_Characteristics(self):
        # NOTE(review): characteristics() presumably returns the eigenvectors it retains; the expected counts are
        # the values pinned by the original test and should be confirmed against the Matrix implementation.
        vectors = self.small.characteristics()
        self.assertEqual(2, len(vectors))
        vectors = self.identity.characteristics()
        self.assertEqual(100, len(vectors))
        vectors = self.medium.characteristics()
        self.assertEqual(46, len(vectors))