Exemplo n.º 1
0
    def setUp(self):
        """Create the resource path, constructor test cases and two spaces."""
        self.dir_ = data_dir + "/space_test_resources/"

        # Each case is (matrix, id2row, id2column, row2id, column2id,
        # operations); the second case has no column labels at all.
        labelled_case = (
            DenseMatrix(np.array([[1, 2], [3, 4]])),
            ["car", "man"],
            ["feat1", "feat2"],
            {"man": 1, "car": 0},
            {"feat1": 0, "feat2": 1},
            [ScalingOperation(EpmiWeighting())],
        )
        unlabelled_case = (
            DenseMatrix(np.array([[1, 2], [3, 4]])),
            ["car", "man"],
            [],
            {"man": 1, "car": 0},
            {},
            [ScalingOperation(EpmiWeighting())],
        )
        self.init_test_cases = [labelled_case, unlabelled_case]

        # One-row space.
        self.m1 = np.array([[1, 2, 3]])
        self.row1 = ["a"]
        self.row2 = ["a", "b", "c"]
        self.ft1 = ["f1", "f2", "f3"]
        self.space1 = Space(DenseMatrix(self.m1), self.row1, self.ft1)

        # Three-row space plus a stored 3x2 matrix (self.us).
        self.x = np.mat([[1, 2, 3],
                         [2, 4, 6],
                         [4, 675, 43]])
        self.us = np.mat([
            [2.19272110e+00, 3.03174768e+00],
            [4.38544220e+00, 6.06349536e+00],
            [6.76369708e+02, -4.91431927e-02],
        ])
        self.space2 = Space(DenseMatrix(self.x), self.row2, self.ft1)
Exemplo n.º 2
0
    def test_top_feat_selection(self):
        """Check TopFeatureSelection output on dense and sparse matrices.

        The selector built without a criterion argument and the one built
        with criterion="length" are both expected to give the same result.
        Invalid constructor arguments must raise ValueError.
        """
        # (input, expected matrix, expected column permutation, no. columns)
        cases = [
            (self.a, np.mat([[3, 1], [5, 4]]), [2, 0], 2),
            (self.a, np.mat([[3], [5]]), [2], 1),
            (self.a, np.mat([[3, 1, 2], [5, 4, 0]]), [2, 0, 1], 6),
        ]

        for source, wanted_mat, wanted_perm, wanted_cols in cases:
            for extra_args in ({}, {"criterion": "length"}):
                selector = TopFeatureSelection(wanted_cols, **extra_args)

                dense_out, dense_perm = selector.apply(DenseMatrix(source))
                np.testing.assert_array_equal(dense_out.mat, wanted_mat)
                self.assertListEqual(dense_perm, wanted_perm)

                sparse_out, sparse_perm = selector.apply(SparseMatrix(source))
                np.testing.assert_array_equal(sparse_out.mat.todense(),
                                              wanted_mat)
                self.assertListEqual(sparse_perm, wanted_perm)

        self.assertRaises(ValueError, TopFeatureSelection, 0)
        self.assertRaises(ValueError, TopFeatureSelection, 2,
                          criterion="something")
Exemplo n.º 3
0
    def test_trivial_crossvalidation(self):
        """Compare ridge regression with a one-value range to a fixed param.

        Also checks that training leaves both input matrices unchanged and
        that a param_range of [0] agrees with the least-squares learner to
        3 decimals.
        """
        for size in range(1, 10):
            dims = (size + 1, 4)
            m_a = DenseMatrix(np.mat(np.random.random(dims)))
            m_b = DenseMatrix(np.mat(np.random.random(dims)))
            a_before = m_a.mat.copy()
            b_before = m_b.mat.copy()

            def assert_inputs_untouched():
                np.testing.assert_array_equal(a_before, m_a.mat)
                np.testing.assert_array_equal(b_before, m_b.mat)

            ranged = RidgeRegressionLearner(param_range=[3], intercept=False)
            ranged_fit = ranged.train(m_a, m_b)

            fixed = RidgeRegressionLearner(param=3, intercept=False)
            fixed_fit = fixed.train(m_a, m_b)

            assert_inputs_untouched()
            np.testing.assert_array_equal(ranged_fit.mat, fixed_fit.mat)

            # Train a fresh ranged learner a second time on the same inputs.
            ranged = RidgeRegressionLearner(param_range=[3], intercept=False)
            ranged_fit = ranged.train(m_a, m_b)

            assert_inputs_untouched()
            np.testing.assert_array_equal(ranged_fit.mat, fixed_fit.mat)

            # A range holding only 0 is compared against plain least squares.
            zero_ridge = RidgeRegressionLearner(param_range=[0],
                                                intercept=False)
            zero_fit = zero_ridge.train(m_a, m_b)

            lstsq = LstsqRegressionLearner(intercept=False)
            lstsq_fit = lstsq.train(m_a, m_b)

            np.testing.assert_array_almost_equal(zero_fit.mat, lstsq_fit.mat,
                                                 3)
Exemplo n.º 4
0
    def test_init_svd(self):
        """Build peripheral spaces on a core space reduced with Svd twice.

        After each reduction the peripheral space must match the expected
        matrix to 2 decimals, share the core row labels, have no column
        labels, and carry one more operation than before.
        """
        cases = [(self.space2, self.us, self.us2, self.x, self.row3)]
        svd_two = Svd(2)
        svd_one = Svd(1)

        for core, wanted_first, wanted_second, data, rows in cases:
            # (reduction to apply, expected matrix, expected operation count)
            steps = ((svd_two, wanted_first, 1),
                     (svd_one, wanted_second, 2))

            for reduction, wanted, op_count in steps:
                core = core.apply(reduction)
                peripheral = PeripheralSpace(core, DenseMatrix(data), rows)

                np.testing.assert_array_almost_equal(
                    wanted, peripheral.cooccurrence_matrix.mat, 2)
                self.assertListEqual(peripheral.id2row, core.id2row)
                self.assertListEqual(peripheral.id2column, [])
                self.assertDictEqual(peripheral.row2id, core.row2id)
                self.assertDictEqual(peripheral.column2id, {})
                self.assertEqual(op_count, len(peripheral.operations))
Exemplo n.º 5
0
    def test_space_compose_dense(self):
        """Compose with FullAdditive(A, B) and compare to expected spaces."""
        # (phrase triples, argument space, expected phrase space, A, B)
        cases = [
            ([("a", "b", "a_b")], self.space4, self.space5,
             DenseMatrix.identity(2), DenseMatrix.identity(2)),
            ([("a", "b", "a_b")], self.space4, self.space6,
             np.mat([[0, 0], [0, 0]]), np.mat([[0, 0], [0, 0]])),
            ([("a", "b", "a_b"), ("a", "b", "a_a")], self.space4, self.space7,
             DenseMatrix.identity(2), DenseMatrix.identity(2)),
        ]

        for triples, arguments, expected, a_param, b_param in cases:
            model = FullAdditive(A=a_param, B=b_param)
            composed = model.compose(triples, arguments)

            np.testing.assert_array_almost_equal(
                composed.cooccurrence_matrix.mat,
                expected.cooccurrence_matrix.mat,
                10)

            # The composed space carries no column labels.
            self.assertListEqual(composed.id2column, [])
            self.assertDictEqual(composed.column2id, {})

            # Row labels must match the expected phrase space.
            self.assertListEqual(composed.id2row, expected.id2row)
            self.assertDictEqual(composed.row2id, expected.row2id)

            self.assertFalse(model._has_intercept)
Exemplo n.º 6
0
    def setUp(self):
        """Create the feature list and the n_space / an_space fixtures."""
        self.ft = ["f1", "f2"]

        n_matrix = DenseMatrix(np.mat([[3, 4], [5, 6]]))
        self.n_space = Space(n_matrix, ["car", "man"], self.ft)

        # Same values as n_space, but rows are labelled "a1_<row>".
        an_matrix = DenseMatrix(np.mat([[3, 4], [5, 6]]))
        self.an_space = Space(an_matrix, ["a1_car", "a1_man"], self.ft)
Exemplo n.º 7
0
    def test_dense_lstsq_regression(self):
        """Regress each pinv test matrix onto an identity matrix.

        The result must equal the second element of the case to 7 decimals.
        """
        for raw, expected in self.pinv_test_cases:
            dense = DenseMatrix(raw)
            eye = DenseMatrix.identity(dense.shape[0])

            fitted = Linalg.lstsq_regression(dense, eye)
            np.testing.assert_array_almost_equal(fitted.mat, expected, 7)
Exemplo n.º 8
0
    def test_train_intercept(self):
        """Train a LexicalFunction with an intercept, then compose with it.

        Phrase vectors are generated as (a_i * n^T)^T for two functor
        matrices a1 and a2. After training, the function space must hold
        rows "a1" and "a2" with element shape (2, 3). Composing with that
        space must reproduce the generated phrase vectors to 8 decimals.
        """
        a1_mat = DenseMatrix(np.mat([[3,4],[5,6]]))
        a2_mat = DenseMatrix(np.mat([[1,2],[3,4]]))

        train_data = [("a1", "man", "a1_man"),
                      ("a2", "car", "a2_car"),
                      ("a1", "boy", "a1_boy"),
                      ("a2", "boy", "a2_boy")
                      ]

        n_mat = DenseMatrix(np.mat([[13,21],[3,4],[5,6]]))
        n_space = Space(n_mat, ["man", "car", "boy"], self.ft)

        an1_mat = (a1_mat * n_mat.transpose()).transpose()
        an2_mat = (a2_mat * n_mat.transpose()).transpose()
        an_mat = an1_mat.vstack(an2_mat)

        an_space = Space(an_mat,
                         ["a1_man", "a1_car", "a1_boy",
                          "a2_man", "a2_car", "a2_boy"],
                         self.ft)

        # test train
        model = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
        model._MIN_SAMPLES = 1
        model.train(train_data, n_space, an_space)
        a_space = model.function_space

        # NOTE(review): the reshape results are not used below; the calls
        # only served the two disabled assertions kept here for reference.
        a1_mat.reshape((1,4))
        #np.testing.assert_array_almost_equal(a1_mat.mat,
        #                                     a_space.cooccurrence_matrix.mat[0])

        a2_mat.reshape((1,4))
        #np.testing.assert_array_almost_equal(a2_mat.mat,
        #                                     a_space.cooccurrence_matrix.mat[1])

        self.assertListEqual(a_space.id2row, ["a1", "a2"])
        self.assertTupleEqual(a_space.element_shape, (2,3))

        # test compose: rebuild the function space from the learned matrix
        a_mat = a_space.cooccurrence_matrix

        a_space = Space(a_mat, ["a1", "a2"], [], element_shape=(2,3))
        model = LexicalFunction(function_space=a_space, intercept=True)
        model._MIN_SAMPLES = 1
        comp_space = model.compose(train_data, n_space)

        self.assertListEqual(comp_space.id2row,
                             ["a1_man", "a2_car", "a1_boy", "a2_boy"])
        self.assertListEqual(comp_space.id2column, [])

        self.assertEqual(comp_space.element_shape, (2,))

        # Rows 0, 4, 2, 5 of an_mat are a1_man, a2_car, a1_boy, a2_boy.
        np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                             an_mat[[0,4,2,5]].mat, 8)
Exemplo n.º 9
0
    def test_dilation(self):
        """Export a Dilation model once untrained and once after training."""
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]

        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))

        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

        training_triples = [("a", "b", "a_b")]

        model = Dilation()
        model.export(self.prefix + ".dil1")
        model.train(training_triples, self.space1, self.space2)
        model.export(self.prefix + ".dil2")
Exemplo n.º 10
0
    def test_weighted_additive(self):
        """Export a WeightedAdditive model untrained and after training."""
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]

        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))

        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

        training_triples = [("a", "a", "a_a")]

        model = WeightedAdditive()
        model.export(self.prefix + ".add1")
        model.train(training_triples, self.space1, self.space2)
        model.export(self.prefix + ".add2")
Exemplo n.º 11
0
    def test_init(self):
        """Build peripheral spaces on a core space with 0, 1 and 2 operations.

        For each stage the peripheral matrix, column labels, row labels and
        operation list are checked. The test also asserts that the original
        core space matrix is left unchanged by the weighting operations.
        """
        test_cases = [(self.space1, self.m2, self.row2,
                       np.array([[2, 0.5, 1]]),
                       np.array([[0.69314718, 0, 0]]))]

        w1 = EpmiWeighting()
        w2 = PlogWeighting()

        for core_s, per_mat, per_row, per_mat_out1, per_mat_out2 in test_cases:
            tmp_mat = per_mat.copy()
            # Snapshot by value. Without the copy, the "core unchanged"
            # assertions below would compare the matrix with itself and
            # could never fail.
            tmp_core_mat = core_s.cooccurrence_matrix.mat.copy()

            # No operations applied to the core space.
            per_s1 = PeripheralSpace(core_s, DenseMatrix(per_mat), per_row)

            np.testing.assert_array_equal(per_s1.cooccurrence_matrix.mat,
                                          tmp_mat)
            self.assert_column_identical(per_s1, core_s)
            self.assertListEqual(per_s1.id2row, per_row)
            self.assertListEqual(per_s1.operations, core_s.operations)

            # One operation (w1) applied.
            core_s1 = core_s.apply(w1)
            per_s2 = PeripheralSpace(core_s1, DenseMatrix(per_mat), per_row)
            np.testing.assert_array_almost_equal(
                per_s2.cooccurrence_matrix.mat, per_mat_out1)
            self.assert_column_identical(per_s2, core_s1)
            self.assertListEqual(per_s2.id2row, per_row)
            self.assertListEqual(per_s2.operations, core_s1.operations)
            self.assertEqual(len(per_s2.operations), 1)

            # Two operations (w1 then w2) applied.
            core_s2 = core_s1.apply(w2)
            per_s3 = PeripheralSpace(core_s2, DenseMatrix(per_mat), per_row)
            np.testing.assert_array_almost_equal(
                per_s3.cooccurrence_matrix.mat, per_mat_out2)
            self.assert_column_identical(per_s3, core_s2)
            self.assertListEqual(per_s3.id2row, per_row)
            self.assertListEqual(per_s3.operations, core_s2.operations)
            self.assertEqual(len(per_s3.operations), 2)

            np.testing.assert_array_equal(tmp_core_mat,
                                          core_s.cooccurrence_matrix.mat)

            # A second peripheral space on the same twice-weighted core.
            core_s3 = core_s2
            per_s4 = PeripheralSpace(core_s3, DenseMatrix(per_mat), per_row)
            np.testing.assert_array_almost_equal(
                per_s4.cooccurrence_matrix.mat, per_mat_out2)
            self.assert_column_identical(per_s4, core_s2)
            self.assertListEqual(per_s4.id2row, per_row)
            self.assertListEqual(per_s4.operations, core_s3.operations)
            self.assertEqual(len(per_s4.operations), 2)

            np.testing.assert_array_equal(tmp_core_mat,
                                          core_s.cooccurrence_matrix.mat)
Exemplo n.º 12
0
    def test_train_random(self):
        """Recover the lambda used by _compose by calling _solve.

        Phrases are composed from random matrices with a known lambda; a
        fresh Dilation model solved on them must end up with that lambda.
        """
        n_rows, n_cols = 4, 3
        first = np.random.rand(n_rows, n_cols)
        second = np.random.rand(n_rows, n_cols)

        for known_lambda in (1.0, 2.0, 3.0):
            generator = Dilation(known_lambda)
            phrases = generator._compose(DenseMatrix(first),
                                         DenseMatrix(second))

            learner = Dilation()
            learner._solve(DenseMatrix(first), DenseMatrix(second), phrases)
            self.assertAlmostEqual(known_lambda, learner._lambda)
Exemplo n.º 13
0
    def test_dense_svd(self):
        """Check Linalg.svd on dense input for several requested dimensions.

        Requests of 2, 3 and 6 dimensions are compared with the full
        expected factors; a request of 1 is compared with their first
        column / first singular value. All comparisons use 2 decimals.
        """
        def check(data, dim, u_wanted, s_wanted, v_wanted):
            u, s, v = Linalg.svd(DenseMatrix(data), dim)
            np.testing.assert_array_almost_equal(u.mat, u_wanted, 2)
            np.testing.assert_array_almost_equal(s, s_wanted, 2)
            np.testing.assert_array_almost_equal(v.mat, v_wanted, 2)

        for x, u_expected, s_expected, v_expected in self.svd_test_cases:
            for dim in (2, 3, 6):
                check(x, dim, u_expected, s_expected, v_expected)

            check(x, 1,
                  u_expected[:, 0:1], s_expected[0:1], v_expected[:, 0:1])
Exemplo n.º 14
0
    def test_intercept_lstsq_regression(self):
        """Check least-squares regression with intercept=True.

        All rows but the last of the result are compared with the first two
        rows of the expected matrix, and the last row with its third row.
        The fit without intercept must have a larger residual norm than the
        fit with intercept.
        """
        a = DenseMatrix(np.matrix([[1, 1], [2, 3], [4, 6]]))
        b = DenseMatrix(np.matrix([[12, 15, 18],
                                   [21, 27, 33],
                                   [35, 46, 57]]))
        expected = DenseMatrix(np.matrix([[1, 2, 3],
                                          [4, 5, 6],
                                          [7, 8, 9]]))

        plain_fit = Linalg.lstsq_regression(a, b)
        intercept_fit = Linalg.lstsq_regression(a, b, intercept=True)

        np.testing.assert_array_almost_equal(intercept_fit.mat[:-1, :],
                                             expected[0:2, :].mat, 6)
        np.testing.assert_array_almost_equal(intercept_fit.mat[-1, :],
                                             expected[2:3, :].mat, 6)

        # Append a column of ones so `a` lines up with the intercept row.
        ones_column = DenseMatrix(np.ones((a.shape[0], 1)))
        a_with_ones = a.hstack(ones_column)

        plain_error = ((a * plain_fit) - b).norm()
        intercept_error = ((a_with_ones * intercept_fit) - b).norm()
        self.assertGreater(plain_error, intercept_error)
Exemplo n.º 15
0
    def setUp(self):
        """Create raw arrays a-f and DenseMatrix wrappers for a-d."""
        self.a = np.array([[1, 2, 3], [4, 0, 5]])
        self.b = np.array([[0, 0, 0], [0, 0, 0]])

        self.c = np.array([[0, 0], [0, 0], [0, 0]])
        self.d = np.array([[1, 0], [0, 1]])
        self.e = np.array([1, 10])
        self.f = np.array([1, 10, 100])

        # matrix_a .. matrix_d wrap the arrays of the same letter.
        for letter in ("a", "b", "c", "d"):
            wrapped = DenseMatrix(getattr(self, letter))
            setattr(self, "matrix_" + letter, wrapped)
Exemplo n.º 16
0
    def test_vstack_raises(self):
        """vstack of self.space2 with each mismatching space raises ValueError."""
        mismatching = [
            # a single column, with a single column label
            Space(DenseMatrix(self.x[0:2, 0:1]), ["e", "f"], self.ft1[0:1]),
            # row labels "a" and "f"
            Space(DenseMatrix(self.x[0:2, :]), ["a", "f"], self.ft1),
            # no column labels
            Space(DenseMatrix(self.x[0:2, :]), ["e", "f"], []),
            # column labels differ from self.ft1 ("f4" in last position)
            Space(DenseMatrix(self.x[0:2, :]), ["e", "f"],
                  ["f1", "f2", "f4"]),
        ]

        for other in mismatching:
            base = self.space2
            self.assertRaises(ValueError, base.vstack, base, other)
Exemplo n.º 17
0
    def test_lexical_function(self):
        """Exporting an untrained LexicalFunction raises; a trained one exports."""
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]

        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))

        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

        training_triples = [("a", "b", "a_b"), ("a", "a", "a_a")]

        model = LexicalFunction()
        model._MIN_SAMPLES = 1
        self.assertRaises(IllegalStateError, model.export,
                          self.prefix + ".lf1")

        model.train(training_triples, self.space1, self.space2)
        model.export(self.prefix + ".lf2")
Exemplo n.º 18
0
    def test_full_additive(self):
        """Exporting an untrained FullAdditive raises; a trained one exports."""
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]

        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))

        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), ["a_a", "a_b"], self.ft)

        training_triples = [("a", "b", "a_b"), ("a", "a", "a_a")]

        model = FullAdditive()
        self.assertRaises(IllegalStateError, model.export,
                          self.prefix + ".full1")

        model.train(training_triples, self.space1, self.space2)
        model.export(self.prefix + ".full2")
Exemplo n.º 19
0
    def _export(self, filename):
        """Write the model matrices to a text file.

        The file holds an "A" section, a "B" section and, when the model
        has an intercept, an "Intercept" section taken from the last row of
        the stored B matrix. Each matrix is written transposed.

        Args:
            filename: path of the file to (over)write.

        Raises:
            IllegalStateError: if either stored matrix is None.
        """
        if self._mat_a_t is None or self._mat_b_t is None:
            raise IllegalStateError("cannot export an untrained FullAdditive model.")

        def transposed_text(stored):
            # Stored matrices are wrapped and transposed before printing.
            return str(DenseMatrix(stored).mat.T)

        with open(filename, "w") as out:
            out.write("A\n")
            out.write(transposed_text(self._mat_a_t))
            out.write("\nB\n")

            if not self._has_intercept:
                out.write(transposed_text(self._mat_b_t))
            else:
                # All rows but the last are B; the last row is the intercept.
                out.write(transposed_text(self._mat_b_t[:-1,]))
                out.write("\nIntercept\n")
                out.write(transposed_text(self._mat_b_t[-1,]))
Exemplo n.º 20
0
def main():
    """
    Convert temporal referencing matrix to regular (binned) matrix.

    Loads a pickled space and keeps the rows whose label either contains no
    underscore or has the form "<target>_<ref>". The kept rows are relabelled
    with the part before the underscore and saved in the requested format.

    Raises:
        ValueError: if no row of the loaded space matches.
    """

    # Get the arguments
    args = docopt(
        """Convert temporal referencing matrix to regular (binned) matrix.

    Usage:
        tr2bin.py (-w | -s) <spacePrefix> <ref> <outPath>

        <spacePrefix> = path to pickled space without suffix
        <ref> = reference string
        <outPath> = output path for result file

    Options:
        -w, --w2v   save in w2v format
        -s, --sps   save in sparse matrix format
        
    """)

    is_w2v = args['--w2v']
    is_sps = args['--sps']
    spacePrefix = args['<spacePrefix>']
    ref = args['<ref>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load spaces
    space = load_pkl_files(spacePrefix)
    matrix = space.get_cooccurrence_matrix().get_mat()
    id2row = space.get_id2row()
    id2column = space.get_id2column()

    # Collect (target, row index) for every row that is either a plain word
    # or a word tagged with the requested reference string.
    selected = []
    for i, w in enumerate(id2row):
        spl = w.split('_')
        if len(spl) == 1 or (len(spl) == 2 and spl[1] == ref):
            selected.append((spl[0], i))

    # Unpacking zip(*[]) would fail with an obscure message; fail clearly.
    if not selected:
        raise ValueError(
            "no row of the space matches reference %r" % (ref,))

    targets, indices = zip(*selected)

    new_matrix = matrix[list(indices), :]

    # Save the Space objects
    if is_w2v:
        new_space = Space(DenseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(new_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=True)
    if is_sps:
        new_space = Space(SparseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(new_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=False)

    logging.info("--- %s seconds ---", time.time() - start_time)
Exemplo n.º 21
0
def main():
    """Convert a text vector file into a pickled Space.

    The first input line holds "<vocabulary size> <dimensions>". Every
    following line holds a word followed by its vector components. The
    resulting Space has the words as row labels and no column labels.
    """
    parser = argparse.ArgumentParser(
        description="Converts a vecf file to dissect pkl format.")
    parser.add_argument('--input',
                        '-i',
                        type=argparse.FileType('r'),
                        help='Input file')
    # pickle writes bytes, so the output must be opened in binary mode.
    parser.add_argument('--output',
                        '-o',
                        type=argparse.FileType('wb'),
                        help='Output file')
    args = parser.parse_args()

    try:
        header = args.input.readline().rstrip()
        vocab_s, dims = map(int, header.split(" "))

        vocab = []

        # init matrix (builtin float: the np.float alias no longer exists)
        matrix = np.zeros((vocab_s, dims), dtype=float)

        for i, line in enumerate(args.input):
            data = line.split()
            # A list, not map(): np.array() cannot consume a lazy iterator.
            vector = np.array([float(value) for value in data[1:]])
            word = data[0]
            vocab.append(word)
            matrix[i] = vector
    finally:
        args.input.close()

    dm = DenseMatrix(matrix)
    sp = Space(dm, vocab, [])
    pickle.dump(sp, args.output)
    args.output.close()
Exemplo n.º 22
0
    def test_space_compose_sparse(self):
        """Compose with FullAdditive(A, B) and compare densified matrices."""
        # TODO (open question from the original author): what to do here?
        # The parameters are given as dense matrices while the input data
        # is sparse.
        cases = [
            ([("a", "b", "a_b")], self.space1, self.space2,
             DenseMatrix.identity(2), DenseMatrix.identity(2)),
            ([("a", "b", "a_b")], self.space1, self.space3,
             np.mat([[0, 0], [0, 0]]), np.mat([[0, 0], [0, 0]])),
        ]

        for triples, arguments, expected, a_param, b_param in cases:
            model = FullAdditive(A=a_param, B=b_param)
            composed = model.compose(triples, arguments)

            got = composed.cooccurrence_matrix.mat.todense()
            wanted = expected.cooccurrence_matrix.mat.todense()
            np.testing.assert_array_almost_equal(got, wanted, 10)
Exemplo n.º 23
0
    def test_mul_raises(self):
        """matrix_a.__mul__ must raise TypeError for each listed operand."""
        # A raw array, a DenseMatrix built from the same array, and a string.
        operands = [self.a, DenseMatrix(self.a), "3"]

        for operand in operands:
            self.assertRaises(TypeError, self.matrix_a.__mul__, operand)
Exemplo n.º 24
0
def main():
    """Convert per-document weight lines into a pickled Space.

    Each line of --input holds space-separated floats and becomes one matrix
    row. The non-empty lines of --docnames become the row labels. The Space
    is pickled to --output (standard output by default).
    """
    # The text is a description; passed positionally it would become `prog`.
    parser = argparse.ArgumentParser(
        description='Converts a VW topic output to a COMPOSES pkl file.')
    parser.add_argument('--input',
                        '-i',
                        type=argparse.FileType('r'),
                        help='Input file')
    parser.add_argument('--docnames',
                        '-d',
                        type=argparse.FileType('r'),
                        help='Docnames file')
    # pickle writes bytes: open files in binary mode and, where stdout has
    # an underlying binary buffer, use that as the default.
    parser.add_argument('--output',
                        '-o',
                        type=argparse.FileType('wb'),
                        default=getattr(sys.stdout, 'buffer', sys.stdout),
                        help='Output file')

    args = parser.parse_args()
    docnames = [l for l in (l.strip() for l in args.docnames) if l]
    matrix = None
    for i, line in enumerate(args.input):
        line = line.strip()
        # A list, not map(): len() and np.array() need a real sequence.
        weights = [float(value) for value in line.split(" ")]
        if matrix is None:
            # The width of the matrix is taken from the first line.
            matrix = np.zeros((len(docnames), len(weights)), dtype=float)
        matrix[i] = np.array(weights)

    dm = DenseMatrix(matrix)
    sp = Space(dm, docnames, [])
    pickle.dump(sp, args.output)
    args.output.close()
Exemplo n.º 25
0
    def setUp(self):
        """Create a sparse and a dense space over the same 2x3 array."""
        self.a = np.array([[1, 2, 3], [4, 0, 5]])

        def build_space(matrix_type):
            # Fresh label lists for every space, as in a literal definition.
            return Space(matrix_type(np.mat(self.a)),
                         ["a", "b"],
                         ["f1", "f2", "f3"])

        self.space_s = build_space(SparseMatrix)
        self.space_d = build_space(DenseMatrix)
    def test_nmf(self):
        """Dense and sparse Linalg.nmf must agree to 2 decimals.

        Both runs start from the same initial factors, obtained from
        Nmf(2).v_col_init on the dense matrix.
        """
        test_cases = [np.mat([[1,2,3],[2,4,6],[4,17,13]], dtype = np.double),
                      np.mat([[1,0,0]], dtype = np.double)]

        for in_mat in test_cases:
            red = Nmf(2)
            d_mat = DenseMatrix(in_mat)
            # Alternative initialisation: red.random_init(d_mat)
            wd_init, hd_init = red.v_col_init(d_mat)

            s_mat = SparseMatrix(in_mat)
            ws_init = SparseMatrix(wd_init)
            hs_init = SparseMatrix(hd_init)

            wd_mat, hd_mat = Linalg.nmf(d_mat, wd_init, hd_init)
            ws_mat, hs_mat = Linalg.nmf(s_mat, ws_init, hs_init)

            # Original author's note: tested against a MATLAB
            # implementation, all good.
            # Single-argument print calls give the same output on
            # Python 2 and also parse on Python 3.
            print("V: %s" % (in_mat,))
            print("WH: %s" % ((ws_mat*hs_mat).mat.todense(),))

            np.testing.assert_array_almost_equal(wd_mat.mat,
                                                 ws_mat.mat.todense(), 2)
            np.testing.assert_array_almost_equal(hd_mat.mat,
                                                 hs_mat.mat.todense(), 2)
Exemplo n.º 27
0
def read_mikolov(spacefile):
    """Read a vector file with packed float data into a Space.

    The first line holds "<vocabulary size> <dimensions>". Each following
    record is a word, one space, `dims` packed floats and one trailing
    character that is discarded.
    NOTE(review): presumably the word2vec binary format; confirm with the
    callers, including the mode the file must be opened in.

    Args:
        spacefile: open file object positioned at the header line.

    Returns:
        A Space with the words as row labels and no column labels.

    Raises:
        ValueError: if a record line contains no space separator.
    """
    header = spacefile.readline().rstrip()
    vocab_s, dims = map(int, header.split(" "))

    vocab = []

    # init matrix (builtin float: the np.float alias no longer exists)
    matrix = np.zeros((vocab_s, dims), dtype=float)

    i = 0
    while True:
        line = spacefile.readline()
        if not line:
            break
        sep = line.find(" ")
        if sep == -1:
            raise ValueError(
                "Couldn't find the vocab/data separation character! Space file corruption?"
            )

        word = line[:sep]
        data = line[sep + 1:]
        # readline() stops at the first newline, which may fall inside the
        # packed data; read whatever is still missing from the record.
        if len(data) < FLOAT_SIZE * dims + 1:
            data += spacefile.read(FLOAT_SIZE * dims + 1 - len(data))
        # Drop the record's trailing character.
        data = data[:-1]
        vocab.append(word)
        vector = (struct.unpack("%df" % dims, data))
        matrix[i] = vector
        i += 1

    dm = DenseMatrix(matrix)
    sp = Space(dm, vocab, [])

    return sp
Exemplo n.º 28
0
    def project(self, matrix_):
        """
        Projects a dim. reduction operation.

        Args:
            matrix_: matrix on which the reduction is projected, of type Matrix

        Returns:
            the reduced matrix, wrapped in a DenseMatrix

        Multiplies matrix_ by the transformation matrix stored in this
        operation object, so that the reduction can be applied to a new
        space, peripheral to the original one. For the "nmf" reduction the
        input is checked with assert_positive() and to_non_negative() is
        called on the result.
        """
        if self.__transmat is None:
            self._raise_projection_error(self.__dim_reduction)

        is_nmf = self.__dim_reduction.name == "nmf"
        if is_nmf:
            matrix_.assert_positive()

        same_kind = isinstance(matrix_, type(self.__transmat))
        if not same_kind:
            warn(
                "WARNING: peripheral matrix type (dense/sparse) should be the same as the core space matrix type!!"
            )

        # Bring both operands to the type of the peripheral matrix.
        operands = resolve_type_conflict([matrix_, self.__transmat],
                                         type(matrix_))
        peripheral, transmat = operands

        projected = peripheral * transmat
        if is_nmf:
            projected.to_non_negative()

        return DenseMatrix(projected)
Exemplo n.º 29
0
def print_cooc_mat_dense_format(matrix_, id2row, file_prefix):
    """Write the matrix to "<file_prefix>.dm", one line per row.

    Each line holds the row label followed by the repr() of every value in
    that row, separated by tabs.

    Args:
        matrix_: matrix indexable by row number.
        id2row: row labels, in row order.
        file_prefix: output path without the ".dm" extension.
    """
    out_path = "%s.%s" % (file_prefix, "dm")

    with open(out_path, 'w') as out:
        for row_no, label in enumerate(id2row):
            values = DenseMatrix(matrix_[row_no]).mat.flat
            fields = [label]
            fields.extend(repr(value) for value in values)
            out.write("%s\n" % "\t".join(fields))
Exemplo n.º 30
0
def to_matrix(matrix_):
    """
    Wrap an array-like structure in the matching Matrix class.

    Input for which issparse() is true becomes a SparseMatrix; everything
    else becomes a DenseMatrix.
    """
    wrapper = SparseMatrix if issparse(matrix_) else DenseMatrix
    return wrapper(matrix_)
Exemplo n.º 31
0
    def test_init_raise(self):
        """Space() must raise ValueError for each inconsistent argument set."""
        two_by_two = [[1, 2], [3, 4]]
        rows = ["car", "man"]
        cols = ["feat1", "feat2"]
        row_index = {"man": 1, "car": 0}
        col_index = {"feat1": 0, "feat2": 1}

        # Each entry: (matrix data, id2row, id2col, row2id, col2id)
        invalid_inputs = [
            # three matrix rows, two row labels
            ([[1, 2], [3, 4], [5, 6]], rows, cols, row_index, col_index),
            # no row labels
            (two_by_two, [], cols, row_index, col_index),
            # empty row2id
            (two_by_two, rows, cols, {}, col_index),
            # one column label for two columns (case appears twice on purpose,
            # mirroring the original list)
            (two_by_two, rows, ["feat1"], row_index, col_index),
            (two_by_two, rows, ["feat1"], row_index, col_index),
            # column2id misses a column
            (two_by_two, rows, cols, row_index, {"feat1": 0}),
            # column2id disagrees with the id2column order
            (two_by_two, rows, cols, row_index, {"feat1": 1, "feat2": 0}),
        ]

        for data, id2row, id2col, row2id, col2id in invalid_inputs:
            # Fresh containers per case so no case shares objects with another.
            self.assertRaises(ValueError, Space,
                              DenseMatrix(np.array(data)),
                              list(id2row), list(id2col),
                              dict(row2id), dict(col2id))
Exemplo n.º 32
0
    def _dense_svd(matrix_, reduced_dimension):
        """
        Compute a truncated singular value decomposition of a dense matrix.

        The number of components kept is the minimum of the number of
        columns of U, ``reduced_dimension`` and the number of singular
        values greater than ``Linalg._SVD_TOL``.

        Args:
            matrix_: input matrix; its ``.mat`` and ``.shape`` are used
            reduced_dimension: upper bound on the number of components kept

        Returns:
            a tuple ``(u, s, v)``: ``u`` is a DenseMatrix holding the kept
            columns of U, ``s`` the kept singular values as returned by
            numpy, and ``v`` a DenseMatrix holding the transpose of the
            kept rows of V^T.
        """

        # Parenthesised form: identical output on Python 2, valid on Python 3.
        print("Running dense svd")
        u, s, vt = np.linalg.svd(matrix_.mat, full_matrices=False,
                                 compute_uv=True)
        # Singular values at or below the tolerance do not count as rank.
        rank = len(s[s > Linalg._SVD_TOL])

        no_cols = min(u.shape[1], reduced_dimension, rank)
        u = DenseMatrix(u[:, 0:no_cols])
        s = s[0:no_cols]
        v = DenseMatrix(vt[0:no_cols, :].transpose())

        Linalg._check_reduced_dim(matrix_.shape[1], u.shape[1], reduced_dimension)

        # Flip the sign of both factors together; u * diag(s) * v^T is
        # unchanged by negating u and v at the same time.
        if not u.is_mostly_positive():
            u = -u
            v = -v

        return u, s, v
Exemplo n.º 33
0
    def setUp(self):
        """Create the row/feature labels, raw matrices and the two spaces used by the tests."""
        # Labels
        self.ft1 = ["f1", "f2", "f3"]
        self.row1 = ["a"]
        self.row2 = ["b"]
        self.row3 = ["a", "b", "c"]

        # Single-row data
        self.m1 = np.array([[1, 2, 3]])
        self.m2 = np.array([[4, 2, 6]])
        self.space1 = Space(DenseMatrix(self.m1), self.row1, self.ft1)

        # Three-row data together with the expected values `us` and `us2`
        self.x = np.mat([[1, 2, 3],
                         [2, 4, 6],
                         [4, 675, 43]])
        self.us = np.mat([[2.19272110e+00, 3.03174768e+00],
                          [4.38544220e+00, 6.06349536e+00],
                          [6.76369708e+02, -4.91431927e-02]])
        self.us2 = np.mat([[2.19272110e+00],
                           [4.38544220e+00],
                           [6.76369708e+02]])

        self.space2 = Space(DenseMatrix(self.x), self.row3, self.ft1)
Exemplo n.º 34
0
    def test_dense_lstsq_regression(self):
        """Regressing each matrix onto the identity must give its expected inverse."""
        for raw, expected_inverse in self.pinv_test_cases:
            dense = DenseMatrix(raw)
            target = DenseMatrix.identity(dense.shape[0])

            solution = Linalg.lstsq_regression(dense, target)
            np.testing.assert_array_almost_equal(solution.mat,
                                                 expected_inverse, 7)
Exemplo n.º 35
0
    def test_space_compose_dense(self):
        """FullAdditive with fixed A and B must compose the expected dense phrase space."""

        def identity():
            return DenseMatrix.identity(2)

        def zeros():
            return np.mat([[0, 0], [0, 0]])

        # Each entry: (phrases, argument space, expected space, A, B)
        scenarios = [
            ([("a", "b", "a_b")],
             self.space4, self.space5, identity(), identity()),
            ([("a", "b", "a_b")],
             self.space4, self.space6, zeros(), zeros()),
            ([("a", "b", "a_b"), ("a", "b", "a_a")],
             self.space4, self.space7, identity(), identity()),
        ]

        for phrases, arg_space, expected_space, mat_a, mat_b in scenarios:
            model = FullAdditive(A=mat_a, B=mat_b)
            composed = model.compose(phrases, arg_space)

            np.testing.assert_array_almost_equal(
                composed.cooccurrence_matrix.mat,
                expected_space.cooccurrence_matrix.mat, 10)

            # The composed space carries no column labels.
            self.assertListEqual(composed.id2column, [])
            self.assertDictEqual(composed.column2id, {})

            self.assertListEqual(composed.id2row, expected_space.id2row)
            self.assertDictEqual(composed.row2id, expected_space.row2id)

            self.assertFalse(model._has_intercept)
Exemplo n.º 36
0
 def test_space_compose_sparse(self):
     """FullAdditive.compose must give the expected phrase space for the sparse fixtures."""
     # Open question kept from the original author: the parameters A and B
     # are given as dense matrices while the input data is sparse.

     # Each entry: (phrases, argument space, expected space, A, B)
     scenarios = [
         ([("a", "b", "a_b")], self.space1, self.space2,
          DenseMatrix.identity(2), DenseMatrix.identity(2)),
         ([("a", "b", "a_b")], self.space1, self.space3,
          np.mat([[0, 0], [0, 0]]), np.mat([[0, 0], [0, 0]])),
     ]

     for phrases, arg_space, expected_space, mat_a, mat_b in scenarios:
         model = FullAdditive(A=mat_a, B=mat_b)
         composed = model.compose(phrases, arg_space)

         actual = composed.cooccurrence_matrix.mat.todense()
         expected = expected_space.cooccurrence_matrix.mat.todense()
         np.testing.assert_array_almost_equal(actual, expected, 10)
Exemplo n.º 37
0
    def setUp(self):
        """Create the raw numpy arrays and the DenseMatrix fixtures built from them."""
        # 2x3 arrays
        self.a = np.array([[1, 2, 3], [4, 0, 5]])
        self.b = np.array([[0, 0, 0], [0, 0, 0]])
        # 3x2 and 2x2 arrays
        self.c = np.array([[0, 0], [0, 0], [0, 0]])
        self.d = np.array([[1, 0], [0, 1]])
        # 1-d arrays
        self.e = np.array([1, 10])
        self.f = np.array([1, 10, 100])

        (self.matrix_a, self.matrix_b,
         self.matrix_c, self.matrix_d) = [
            DenseMatrix(raw) for raw in (self.a, self.b, self.c, self.d)
        ]
Exemplo n.º 38
0
    def test_dense_ridge_regression(self):
        """Compare the ridge solution against the least-squares one on each case.

        For every case the least-squares solution must equal the expected
        inverse, and "error + solution norm" of the least-squares solution
        must be at least that of the ridge solution.
        """
        for raw, expected_inverse in self.pinv_test_cases:
            dense = DenseMatrix(raw)
            identity = DenseMatrix.identity(dense.shape[0])

            lstsq_solution = Linalg.lstsq_regression(dense, identity)
            np.testing.assert_array_almost_equal(lstsq_solution.mat,
                                                 expected_inverse, 7)

            # Third argument 1 is presumably the ridge parameter - TODO confirm.
            ridge_solution = Linalg.ridge_regression(dense, identity, 1)[0]

            lstsq_error = (dense * lstsq_solution
                           - DenseMatrix(expected_inverse)).norm()
            ridge_error = (dense * ridge_solution
                           - DenseMatrix(expected_inverse)).norm()

            lstsq_total = lstsq_error + lstsq_solution.norm()
            ridge_total = ridge_error + ridge_solution.norm()

            # NOTE: the original author expected ridge_error >= lstsq_error
            # to hold as well, but it does not (rounding error suspected),
            # so that inequality is deliberately not asserted.
            self.assertGreaterEqual(lstsq_total, ridge_total)
Exemplo n.º 39
0
 def compute_matreps(self,vecspace,matspace,multiply_matrices=False):
     '''
     Compute the symbolic and numeric matrix representations of a papfunc
     node and store them on the node as self._matrep and self._numrep.

     Terminal nodes get their representations from
     insert_terminal_node_representation. Unary nodes inherit those of
     their single daughter. Binary nodes combine the representations
     already stored on their two daughters; the daughters are read, not
     computed, here.

     Args:
         vecspace: vector space, passed to insert_terminal_node_representation
         matspace: matrix space, passed to insert_terminal_node_representation
         multiply_matrices: optional Boolean. If True, matrices are
             multiplied rather than summed when both subconstituents have
             a matrix in the same position.

     Raises:
         ValueError: for a non-terminal node without children, for a binary
             node with a daughter whose matrix representation is empty, and
             for nodes with more than two children.
     '''
     # for terminal nodes do lexical insertions by calling
     # insert_terminal_node_representation
     if self.is_terminal():
         matrep,temp_numrep=self.insert_terminal_node_representation(vecspace,matspace)
         self._matrep = matrep
         if temp_numrep[0] == "empty":
             numrep = [] #default semantic representation for syntactic elements we ignore
         else:
             numrep = [temp_numrep[0].transpose()]
             dimensionality=(temp_numrep[0].shape[1])
             if len(temp_numrep)>1:
                 # Matrices are "flattened", stored as vectors.
                 # We reshape each matrix to a normal shape (usually square)
                 for x in range(1, (len(temp_numrep))):
                     y = DenseMatrix(temp_numrep[x])
                     # Floor division keeps the second dimension an integer
                     # on Python 3 as well (same result as "/" on Python 2 ints).
                     y.reshape((dimensionality,(y.shape[1]//dimensionality)))
                     numrep.append(y)
         self._numrep = numrep
     #raise an exception for a non-terminal node without children
     elif len(self._children) == 0:
         raise ValueError("Non-terminal non-branching node!")
     # inherit the value of the single daughter in case of unary branching
     if len(self._children) == 1:
         self._matrep = self.get_child(0)._matrep
         self._numrep = self.get_child(0)._numrep
     #apply composition for binary branching nodes
     if len(self._children) == 2 and self._matrep == []:
         matrep1=self.get_child(0)._matrep
         if not matrep1:
             raise ValueError("Empty matrix representation for node %s!" %self.get_child(0))
         matrep2=self.get_child(1)._matrep
         if not matrep2:
             raise ValueError("Empty matrix representation for node %s!" %self.get_child(1))
         #get the arity of two daughter nodes in order to determine which of
         #them is the function and which is the argument
         arity1=len(matrep1)-1
         arity2=len(matrep2)-1
         # first, compute symbolic matrix representation
         # daughters of equal arity: componentwise addition
         if arity1-arity2 == 0:
             for x in range(0, arity1+1):
                 self._matrep.append('(' + matrep1[x] + '+' + matrep2[x] + ')')
         #left application: the second daughter is the function
         if arity1 < arity2 and not re.search('empty$',matrep2[0]) and not re.search('empty$',matrep1[0]):
             for x in range(0, arity2):
                 if x == 0: # compute vector of the mother node
                     self._matrep.append('(' + matrep2[x] + '+' + matrep2[arity2] + '*' + matrep1[x] + ')')
                 elif x < len(matrep1): # compute matrices of the mother node
                     if multiply_matrices: self._matrep.append('(' + matrep2[x] + '*' + matrep1[x] + ')')
                     else: self._matrep.append('(' + matrep2[x] + '+' + matrep1[x] + ')')
                 else:
                     # only the function has a matrix in this position
                     self._matrep.append(matrep2[x])
         #right application: the first daughter is the function
         if arity1 > arity2 and not re.search('empty$',matrep2[0]) and not re.search('empty$',matrep1[0]):
             for x in range(0, arity1):
                 if x == 0:
                     self._matrep.append('(' + matrep1[x] + '+' + matrep1[arity1] + '*' + matrep2[x] + ')')
                 elif x < len(matrep2):
                     if multiply_matrices: self._matrep.append('(' + matrep1[x] + '*' + matrep2[x] + ')')
                     else: self._matrep.append('(' + matrep1[x] + '+' + matrep2[x] + ')')
                 else:
                     self._matrep.append(matrep1[x])
         #if one of the daughters is 'empty' (marked to be ignored), ignore it:
         #the mother takes over the other daughter's representation
         if re.search('empty$',matrep1[0]):
             self._matrep = matrep2
         if re.search('empty$',matrep2[0]):
             self._matrep = matrep1
         # computing numeric matrix representation of a node from those of
         # its two daughters.
         # The arities computed above establish the directionality
         # of function application
         numrep1=self.get_child(0)._numrep
         numrep2=self.get_child(1)._numrep
         if arity1-arity2 == 0 and numrep1 and numrep2:
             for x in range(0, arity1+1):
                 self._numrep.append(numrep1[x].__add__(numrep2[x]))
         #left application
         if arity1 < arity2 and not numrep1==[] and not numrep2==[]:
             for x in range(0, arity2):
                 # compute the vector
                 if x == 0:
                     self._numrep.append(numrep2[x].__add__(numrep2[arity2] * numrep1[x]))
                 # compute a matrix
                 elif x < len(numrep1):
                     if multiply_matrices:
                         self._numrep.append(numrep2[x] * numrep1[x])
                     else:
                         self._numrep.append(numrep1[x].__add__(numrep2[x]))
                 else:
                     self._numrep.append(numrep2[x])
         #right application
         if arity1 > arity2 and not numrep1==[] and not numrep2==[]:
             for x in range(0, arity1):
                 if x == 0:
                     self._numrep.append(numrep1[x].__add__(numrep1[arity1]*numrep2[x]))
                 elif x < len(numrep2):
                     if multiply_matrices:
                         # NOTE(review): the symbolic form above is
                         # matrep1 * matrep2, but the numeric product is
                         # numrep2 * numrep1. Confirm which operand order
                         # is intended; left unchanged here.
                         self._numrep.append(numrep2[x] * numrep1[x])
                     else:
                         self._numrep.append(numrep1[x].__add__(numrep2[x]))
                 else:
                     self._numrep.append(numrep1[x])
         # ignore 'empty' elements in composition
         if (numrep1 == []):
             self._numrep = numrep2
         if (numrep2 == []):
             self._numrep = numrep1
     # end of numrep computation
     # Raise an exception for non-binary branching - we don't want to handle those structures
     if len(self._children)>2:
         raise ValueError("Matrix representations are not defined for trees with more than binary branching")
Exemplo n.º 40
0
 def train(self, matrix_a, matrix_b=None):
     """
     Return an identity DenseMatrix sized by the column count of matrix_a.

     matrix_b is ignored
     """
     n_columns = matrix_a.shape[1]
     return DenseMatrix.identity(n_columns)
Exemplo n.º 41
0
class TestDenseMatrix(unittest.TestCase):
    """Tests for DenseMatrix construction, arithmetic, scaling and plog."""

    def setUp(self):
        """Create the raw numpy arrays and the DenseMatrix fixtures built from them."""
        self.a = np.array([[1, 2, 3], [4, 0, 5]])
        self.b = np.array([[0, 0, 0], [0, 0, 0]])

        self.c = np.array([[0, 0], [0, 0], [0, 0]])
        self.d = np.array([[1, 0], [0, 1]])
        self.e = np.array([1, 10])
        self.f = np.array([1, 10, 100])

        (self.matrix_a, self.matrix_b,
         self.matrix_c, self.matrix_d) = [
            DenseMatrix(raw) for raw in (self.a, self.b, self.c, self.d)
        ]

    def tearDown(self):
        """Nothing to clean up."""
        pass

    def test_init(self):
        """Every accepted input type ends up as an np.matrix with equal content."""
        raw = self.a
        sources = [raw, np.mat(raw), csr_matrix(raw), csc_matrix(raw),
                   SparseMatrix(raw)]

        for source in sources:
            dense = DenseMatrix(source)
            self.assertIsInstance(dense.mat, np.matrix)
            numpy.testing.assert_array_equal(raw, np.array(dense.mat))

    def test_add(self):
        """Adding two DenseMatrix objects gives the elementwise sum, same type as the left term."""
        scenarios = [
            (self.matrix_a, self.matrix_a, np.mat([[2, 4, 6], [8, 0, 10]])),
            (self.matrix_a, self.matrix_b, self.matrix_a.mat),
        ]

        for left, right, expected in scenarios:
            total = left + right
            numpy.testing.assert_array_equal(total.mat, expected)
            self.assertIsInstance(total, type(left))

    def test_add_raises(self):
        """Adding a raw array or a SparseMatrix raises TypeError."""
        bad_terms = [self.a, SparseMatrix(self.a)]

        for other in bad_terms:
            self.assertRaises(TypeError, self.matrix_a.__add__, other)

    def test_div(self):
        """Dividing by a scalar divides every cell and returns a DenseMatrix."""
        scenarios = [
            (self.matrix_a, 2, np.mat([[0.5, 1.0, 1.5], [2.0, 0.0, 2.5]])),
            (self.matrix_c, 2, np.mat(self.c)),
        ]

        for numerator, divisor, expected in scenarios:
            quotient = numerator / divisor
            numpy.testing.assert_array_equal(quotient.mat, expected)
            self.assertIsInstance(quotient, DenseMatrix)

    def test_div_raises(self):
        """Non-scalar divisors raise TypeError; zero raises ZeroDivisionError."""
        scenarios = [
            (self.a, TypeError),
            (SparseMatrix(self.a), TypeError),
            ("3", TypeError),
            (0, ZeroDivisionError),
        ]

        for divisor, error_type in scenarios:
            self.assertRaises(error_type, self.matrix_a.__div__, divisor)

    def test_mul(self):
        """`*` is the matrix product for matrices and scaling for scalars on either side."""
        doubled = [[2, 4, 6], [8, 0, 10]]
        scenarios = [
            (self.matrix_a, self.matrix_c, np.mat([[0, 0], [0, 0]])),
            (self.matrix_d, self.matrix_a, self.matrix_a.mat),
            (self.matrix_a, 2, np.mat(doubled)),
            (2, self.matrix_a, np.mat(doubled)),
            (self.matrix_a, np.int64(2), np.mat(doubled)),
            (np.int64(2), self.matrix_a, np.mat(doubled)),
        ]

        for left, right, expected in scenarios:
            product = left * right
            numpy.testing.assert_array_equal(product.mat, expected)
            self.assertIsInstance(product, DenseMatrix)

    def test_mul_raises(self):
        """Calling __mul__ with a raw array, a SparseMatrix or a string operand raises TypeError."""
        scenarios = [
            (self.matrix_a, self.a),
            (self.matrix_a, SparseMatrix(self.a)),
            (self.matrix_a, "3"),
            ("3", self.matrix_a),
        ]

        for left, right in scenarios:
            self.assertRaises(TypeError, left.__mul__, right)

    def test_multiply(self):
        """multiply() is the elementwise product and gives the same result in both orders."""
        scenarios = [
            (self.matrix_a, self.matrix_a, np.mat([[1, 4, 9], [16, 0, 25]])),
            (self.matrix_a, self.matrix_b, np.mat(self.b)),
        ]

        for first, second, expected in scenarios:
            forward = first.multiply(second)
            backward = second.multiply(first)

            numpy.testing.assert_array_equal(forward.mat, expected)
            numpy.testing.assert_array_equal(backward.mat, expected)

            self.assertIsInstance(forward, DenseMatrix)
            self.assertIsInstance(backward, DenseMatrix)

    def test_multiply_raises(self):
        """multiply() rejects the 2x2 matrix with ValueError and non-DenseMatrix terms with TypeError."""
        scenarios = [
            (self.matrix_d, ValueError),
            (self.a, TypeError),
            (SparseMatrix(self.a), TypeError),
        ]

        for other, error_type in scenarios:
            self.assertRaises(error_type, self.matrix_a.multiply, other)

    def test_scale_rows(self):
        """scale_rows() accepts the factors as a 1-d array or as a column matrix."""
        expected = np.mat([[1, 2, 3], [40, 0, 50]])
        factor_forms = [self.e, np.mat(self.e).T]

        for factors in factor_forms:
            scaled = self.matrix_a.scale_rows(factors)
            numpy.testing.assert_array_equal(scaled.mat, expected)

    def test_scale_columns(self):
        """scale_columns() multiplies each column by its factor."""
        scenarios = [
            (self.matrix_a, self.f, np.mat([[1, 20, 300], [4, 0, 500]])),
        ]

        for matrix, factors, expected in scenarios:
            scaled = matrix.scale_columns(factors)
            numpy.testing.assert_array_equal(scaled.mat, expected)

    def test_scale_raises(self):
        """Scaling with these factor arrays raises ValueError; a string raises TypeError."""
        scale_rows = self.matrix_a.scale_rows
        scale_columns = self.matrix_a.scale_columns
        scenarios = [
            (scale_rows, self.f, ValueError),
            (scale_columns, self.e, ValueError),
            (scale_rows, self.b, ValueError),
            (scale_columns, self.b, ValueError),
            (scale_rows, "3", TypeError),
        ]
        for scale, factors, error_type in scenarios:
            self.assertRaises(error_type, scale, factors)

    def test_plog(self):
        """plog() changes the matrix in place; results are compared to 3 decimals."""
        fractional = DenseMatrix(np.mat([[0.5, 1.0, 1.5], [2.0, 0.0, 2.5]]))
        fractional_expected = np.mat([[0.0, 0.0, 0.4054],
                                      [0.6931, 0.0, 0.9162]])
        a_expected = np.mat([[0.0, 0.6931, 1.0986],
                             [1.3862, 0.0, 1.6094]])
        # matrix_a is copied so the shared fixture is not modified.
        scenarios = [
            (self.matrix_a.copy(), a_expected),
            (fractional, fractional_expected),
        ]

        for matrix, expected in scenarios:
            matrix.plog()
            numpy.testing.assert_array_almost_equal(matrix.mat, expected, 3)
Exemplo n.º 42
0
    def test_3d(self):
        """Train LexicalFunction models on a toy space, checking the 2d and 3d function spaces and 3d composition."""

        # --- fixtures -------------------------------------------------
        v_mat = DenseMatrix(np.mat([[0, 0, 1, 1, 2, 2, 3, 3],    # hate
                                    [0, 1, 2, 4, 5, 6, 8, 9]]))  # love

        # Order matters: it matches the expected row order of the trained
        # function space (hate_boy, hate_man, love_boy, love_car).
        vo_mats = [
            DenseMatrix(np.mat([[0, 11], [22, 33]])),   # hate boy
            DenseMatrix(np.mat([[0, 7], [14, 21]])),    # hate man
            DenseMatrix(np.mat([[6, 34], [61, 94]])),   # love boy
            DenseMatrix(np.mat([[2, 10], [17, 26]])),   # love car
        ]

        train_vo_data = [
            ("hate_boy", "man", "man_hate_boy"),
            ("hate_man", "man", "man_hate_man"),
            ("hate_boy", "boy", "boy_hate_boy"),
            ("hate_man", "boy", "boy_hate_man"),
            ("love_car", "boy", "boy_love_car"),
            ("love_boy", "man", "man_love_boy"),
            ("love_boy", "boy", "boy_love_boy"),
            ("love_car", "man", "man_love_car"),
        ]

        # Open question from the original author: what should happen if a
        # phrase is not found?
        train_v_data = [
            ("love", "boy", "love_boy"),
            ("hate", "man", "hate_man"),
            ("hate", "boy", "hate_boy"),
            ("love", "car", "love_car"),
        ]

        sentences = [
            "man_hate_boy", "car_hate_boy", "boy_hate_boy",
            "man_hate_man", "car_hate_man", "boy_hate_man",
            "man_love_boy", "car_love_boy", "boy_love_boy",
            "man_love_car", "car_love_car", "boy_love_car",
        ]
        n_mat = DenseMatrix(np.mat([[3, 4], [1, 2], [5, 6]]))
        n_space = Space(n_mat, ["man", "car", "boy"], self.ft)

        # Sentence vectors: each verb-object matrix applied to every noun.
        sentence_blocks = [(vo * n_mat.transpose()).transpose()
                           for vo in vo_mats]
        s_mat = vo_mats[0].nary_vstack(sentence_blocks)
        s_space = Space(s_mat, sentences, self.ft)

        # --- test train 2d --------------------------------------------
        model = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
        model._MIN_SAMPLES = 1
        model.train(train_vo_data, n_space, s_space)
        vo_space = model.function_space

        self.assertListEqual(vo_space.id2row,
                             ["hate_boy", "hate_man", "love_boy", "love_car"])
        self.assertTupleEqual(vo_space.element_shape, (2, 2))
        for row_idx, vo in enumerate(vo_mats):
            # The learned functions are stored flattened, one per row.
            vo.reshape((1, 4))
            np.testing.assert_array_almost_equal(
                vo.mat, vo_space.cooccurrence_matrix.mat[row_idx])

        # --- test train 3d --------------------------------------------
        model2 = LexicalFunction(learner=LstsqRegressionLearner(intercept=False))
        model2._MIN_SAMPLES = 1
        model2.train(train_v_data, n_space, vo_space)
        v_space = model2.function_space
        np.testing.assert_array_almost_equal(v_mat.mat,
                                             v_space.cooccurrence_matrix.mat)
        self.assertListEqual(v_space.id2row, ["hate", "love"])
        self.assertTupleEqual(v_space.element_shape, (2, 2, 2))

        # --- test compose 3d ------------------------------------------
        vo_space2 = model2.compose(train_v_data, n_space)
        trained_rows = list(vo_space.id2row)
        composed_rows = sorted(vo_space2.id2row)
        self.assertListEqual(trained_rows, composed_rows)

        row_list = vo_space.id2row
        trained = vo_space.get_rows(row_list)
        composed = vo_space2.get_rows(row_list)
        np.testing.assert_array_almost_equal(trained.mat, composed.mat, 7)
        self.assertTupleEqual(vo_space.element_shape,
                              vo_space2.element_shape)
Exemplo n.º 43
0
    def compute_matreps(self,vecspace,matspace,multiply_matrices=False):
        '''
        Compute the symbolic and numeric matrix representations of a papfunc
        node and store them on the node as self._matrep and self._numrep.

        Terminal nodes get their representations from
        insert_terminal_node_representation. Unary nodes inherit those of
        their single daughter. Binary nodes combine the representations
        already stored on their two daughters; the daughters are read, not
        computed, here.

        Args:
            vecspace: vector space, passed to insert_terminal_node_representation
            matspace: matrix space, passed to insert_terminal_node_representation
            multiply_matrices: optional Boolean. If True, matrices are
                multiplied rather than summed when both subconstituents have
                a matrix in the same position.

        Raises:
            ValueError: for a non-terminal node without children, for a
                binary node with a daughter whose matrix representation is
                empty, and for nodes with more than two children.
        '''
        # for terminal nodes call insert_terminal_node_representation
        if self.is_terminal():
            matrep,temp_numrep=self.insert_terminal_node_representation(vecspace,matspace)
            self._matrep = matrep
            if temp_numrep[0] == "empty":
                numrep = []
            else:
                numrep = [temp_numrep[0].transpose()]
                dimensionality=(temp_numrep[0].shape[1])
                if len(temp_numrep)>1:
                    # all matrices are stored flattened, as long vectors. We
                    # need to reshape them before we use them in computations
                    for x in range(1, (len(temp_numrep))):
                        y = DenseMatrix(temp_numrep[x])
                        # Floor division keeps the second dimension an integer
                        # on Python 3 as well (same result as "/" on Python 2 ints).
                        y.reshape((dimensionality,(y.shape[1]//dimensionality)))
                        numrep.append(y)
            self._numrep = numrep
        #raise an exception for a non-terminal node without children
        elif len(self._children) == 0:
            raise ValueError("Non-terminal non-branching node!")
        # inherit the value of the single daughter in case of unary branching
        if len(self._children) == 1:
            self._matrep = self.get_child(0)._matrep
            self._numrep = self.get_child(0)._numrep
        #apply composition for binary branching nodes
        if len(self._children) == 2 and self._matrep == []:
            matrep1=self.get_child(0)._matrep
            if not matrep1:
                raise ValueError("Empty matrix representation for node %s!" %self.get_child(0))
            matrep2=self.get_child(1)._matrep
            if not matrep2:
                raise ValueError("Empty matrix representation for node %s!" %self.get_child(1))
            # the arities decide which daughter acts as the function
            arity1=len(matrep1)-1
            arity2=len(matrep2)-1
            # first, compute symbolic matrix representation
            # default to componentwise addition for daughters of equal arity
            if arity1-arity2 == 0:
                for x in range(0, arity1+1):
                    self._matrep.append('(' + matrep1[x] + '+' + matrep2[x] + ')')
            # left function application: the second daughter is the function
            if arity1 < arity2 and not re.search('empty$',matrep2[0]) and not re.search('empty$',matrep1[0]):
                for x in range(0, arity2):
                    if x == 0: #compute the vector
                        self._matrep.append('(' + matrep2[x] + '+' + matrep2[arity2] + '*' + matrep1[x] + ')')
                    # compute a matrix
                    # If both daughters have matrices in the xth position in
                    # their vector-matrix structures, add or multiply those
                    # matrices according to the multiply_matrices parameter
                    elif x < len(matrep1):
                        if multiply_matrices: self._matrep.append('(' + matrep2[x] + '*' + matrep1[x] + ')')
                        else: self._matrep.append('(' + matrep2[x] + '+' + matrep1[x] + ')')
                    # inherit the function's extra lexical matrix
                    else:
                        self._matrep.append(matrep2[x])
            # right function application: the first daughter is the function
            if arity1 > arity2 and not re.search('empty$',matrep2[0]) and not re.search('empty$',matrep1[0]):
                for x in range(0, arity1):
                    if x == 0:
                        self._matrep.append('(' + matrep1[x] + '+' + matrep1[arity1] + '*' + matrep2[x] + ')')
                    # compute a matrix
                    # If both daughters have matrices in the xth position in
                    # their vector-matrix structures, add or multiply those
                    # matrices according to the multiply_matrices parameter
                    elif x < len(matrep2):
                        if multiply_matrices: self._matrep.append('(' + matrep1[x] + '*' + matrep2[x] + ')')
                        else: self._matrep.append('(' + matrep1[x] + '+' + matrep2[x] + ')')
                    else:
                        self._matrep.append(matrep1[x])
            # ignore 'empty' elements: the mother takes over the other
            # daughter's representation
            if re.search('empty$',matrep1[0]):
                self._matrep = matrep2
            if re.search('empty$',matrep2[0]):
                self._matrep = matrep1
            # computing numeric matrix representation of a node from those of its two daughters
            numrep1=self.get_child(0)._numrep
            numrep2=self.get_child(1)._numrep
            if arity1-arity2 == 0 and numrep1 and numrep2:
                for x in range(0, arity1+1):
                    self._numrep.append(numrep1[x].__add__(numrep2[x]))
            # left function application
            if arity1 < arity2 and not numrep1==[] and not numrep2==[]:
                for x in range(0, arity2):
                    if x == 0: #compute the vector
                        self._numrep.append(numrep2[x].__add__(numrep2[arity2] * padd_matrix(numrep1[x],0)))
                    elif x < len(numrep1):
                        if multiply_matrices:
                            self._numrep.append(numrep2[x] * numrep1[x])
                        else:
                            self._numrep.append(numrep1[x].__add__(numrep2[x]))
                    else:
                        self._numrep.append(numrep2[x])
            # right function application
            if arity1 > arity2 and not numrep1==[] and not numrep2==[]:
                for x in range(0, arity1):
                    if x == 0: # compute the vector
                        self._numrep.append(numrep1[x].__add__(numrep1[arity1] * padd_matrix(numrep2[x],0)))
                    elif x < len(numrep2):
                        if multiply_matrices:
                            # NOTE(review): the symbolic form above is
                            # matrep1 * matrep2, but the numeric product is
                            # numrep2 * numrep1. Confirm which operand order
                            # is intended; left unchanged here.
                            self._numrep.append(numrep2[x] * numrep1[x])
                        else:
                            self._numrep.append(numrep1[x].__add__(numrep2[x]))
                    else:
                        self._numrep.append(numrep1[x])
            # ignore 'empty' elements
            if (numrep1 == []):
                self._numrep = numrep2
            if (numrep2 == []):
                self._numrep = numrep1
        # end of numrep computation
        # Raise an exception for non-binary branching - we don't want to handle those structures
        if len(self._children)>2:
            raise ValueError("Matrix representations are not defined for trees with more than binary branching")
Exemplo n.º 44
0
    def tracenorm_regression(matrix_a, matrix_b, lmbd, iterations, intercept=False):
        """
        Performs Trace Norm Regression.

        This method uses approximate gradient descent
        to solve the problem:
            :math:`X = argmin(||AX - B||_2 + \\lambda||X||_*)`
        where :math:`||X||_*` is the trace norm of :math:`X`, the sum of its
        singular values.
        It is implemented for dense matrices only (the inputs are converted
        to DenseMatrix before the optimization starts).
        The algorithm is the Extended Gradient Algorithm from (Ji and Ye, 2009).
        Note that the acceleration (momentum) step of that algorithm is
        currently disabled: the search point Z is always reset to the
        current solution.

        Args:
            matrix_a: input matrix A, of type Matrix
            matrix_b: input matrix B, of type Matrix. If None, it is defined
                as matrix_a (after the intercept column, if any, was added)
            lmbd: scalar, lambda parameter. It is multiplied internally by
                the number of rows of A.
            iterations: int, maximum number of outer iterations
            intercept: bool. If True a column of ones is appended to A.
                Optional, default False.

        Returns:
            a tuple (X, costs), where X is the solution, of type DenseMatrix,
            and costs is a list of [L, L_bound, fitness, cost] entries, one
            for every outer iteration except the first one.

        """

        if intercept:
            # Build the column of ones with the same matrix class as A so
            # that hstack receives compatible operands.
            matrix_type = type(matrix_a)
            matrix_a = matrix_a.hstack(matrix_type(np.ones((matrix_a.shape[0],
                                                             1))))
        # Identity test: "== None" would dispatch to the matrix __eq__.
        if matrix_b is None:
            matrix_b = matrix_a

        # TODO remove this
        matrix_a = DenseMatrix(matrix_a).mat
        matrix_b = DenseMatrix(matrix_b).mat

        # Number of training samples (rows of A)
        p = matrix_a.shape[0]
        assert_same_shape(matrix_a, matrix_b, 0)

        # Initialization of the algorithm
        W = (1.0/p) * Linalg._kronecker_product(matrix_a)

        # Sub-expressions reused at various places in the code
        matrix_a_t = matrix_a.transpose()
        at_times_a = np.dot(matrix_a_t, matrix_a)

        # Epsilon: to ensure that our bound on the Lipschitz constant is large enough
        epsilon_lbound = 0.05
        # Expression of the bound of the Lipschitz constant of the cost function
        L_bound = (1+epsilon_lbound)*2*Linalg._frobenius_norm_squared(at_times_a)
        # Current "guess" of the local Lipschitz constant
        L = 1.0
        # Factor by which L should be increased when it happens to be too small
        gamma = 1.2
        # Epsilon to ensure that L is increased when the inequality holds tightly
        epsilon_cost = 0.00001
        # Real lambda: resized according to the number of training samples (?)
        lambda_ = lmbd*p
        # Variables used for the accelerated algorithm (check the original paper)
        Z = W
        alpha = 1.0
        # Halting condition: relative change of the cost below epsilon.
        # last_cost/current_cost start at values that let the loop be entered.
        epsilon = 0.00001
        last_cost = 1
        current_cost = -1
        linalg_error_caught = False

        costs = []
        iter_counter = 0
        while (iter_counter < iterations
               and abs((current_cost - last_cost)/last_cost) > epsilon
               and not linalg_error_caught):
            sys.stdout.flush()
            # First guess for this iteration, with the current value of L
            try:
                next_W, tracenorm = Linalg._next_tracenorm_guess(
                    matrix_a, matrix_b, lambda_, L, Z, at_times_a)
            except LinAlgError:
                print("LinAlgError caught in trace norm regression")
                linalg_error_caught = True
                break

            # Cost tracking
            last_cost = current_cost
            current_fitness = Linalg._fitness(matrix_a, matrix_b, next_W)
            current_cost = current_fitness + lambda_ * tracenorm
            if iter_counter > 0: # The first scores are messy
                cost_list = [L, L_bound, current_fitness, current_cost]
                costs.append(cost_list)

            # Line search: increase L until the fitness of the guess falls
            # below the intermediate cost computed around Z.
            while (current_fitness + epsilon_cost >=
                    Linalg._intermediate_cost(matrix_a, matrix_b, next_W, Z, L)):
                if L > L_bound:
                    print("Trace Norm Regression: numerical error detected at iteration "+str(iter_counter))
                    break
                L = gamma * L
                try:
                    next_W, tracenorm = Linalg._next_tracenorm_guess(
                        matrix_a, matrix_b, lambda_, L, Z, at_times_a)
                except LinAlgError:
                    print("LinAlgError caught in trace norm regression")
                    linalg_error_caught = True
                    break

                last_cost = current_cost
                # The fitness is measured against the target B, exactly as
                # for the first guess above and in _intermediate_cost.
                current_fitness = Linalg._fitness(matrix_a, matrix_b, next_W)
                current_cost = current_fitness + lambda_*tracenorm

            if linalg_error_caught:
                break

            # previous_W / previous_alpha / alpha are only needed by the
            # (disabled) acceleration step below; they are kept so that the
            # step can be re-enabled without further changes.
            previous_W = W
            W = next_W
            previous_alpha = alpha
            alpha = (1.0 + sqrt(1.0 + 4.0*alpha*alpha))/2.0
            Z = W
            # Z = W + ((alpha - 1)/alpha)*(W - previous_W)
            iter_counter += 1

        sys.stdout.flush()
        # Drop any imaginary part before wrapping the result
        W = np.real(W)
        return DenseMatrix(W), costs