예제 #1
0
    def project(self, matrix_):
        """
        Projects a dim. reduction operation.

        Args:
            matrix_: matrix on which the reduction is projected, of type Matrix

        Returns:
            the reduced matrix, wrapped as a DenseMatrix

        Uses the transformation matrix stored in the operation object to project
        the dimensionality reduction method on a new space, peripheral to the
        original one.
        """

        # Projection is only possible after the core-space reduction has run
        # and its transformation matrix was recorded on this operation.
        if self.__transmat is None:
            self._raise_projection_error(self.__dim_reduction)

        # NMF is only defined for non-negative input matrices.
        if self.__dim_reduction.name == "nmf":
            matrix_.assert_positive()

        if not isinstance(matrix_, type(self.__transmat)):
            warn("WARNING: peripheral matrix type (dense/sparse) should be the same as the core space matrix type!!")

        # Coerce both operands to the peripheral matrix's type so the
        # multiplication below operates on matching representations.
        [matrix_, transmat] = resolve_type_conflict([matrix_, self.__transmat],
                                                        type(matrix_))

        result_mat = matrix_ * transmat

        # Keep NMF results non-negative after the projection.
        if self.__dim_reduction.name == "nmf":
            result_mat.to_non_negative()

        return DenseMatrix(result_mat)
예제 #2
0
    def xxx(self, arg1_space, arg2_space, phrase_space, arg1_list, arg2_list,
            phrase_list):
        """
        Accumulates the training statistics (dot products and squared norms)
        over the data, processing it in memory-bounded chunks, then hands the
        summed statistics to _train2.

        Args:
            arg1_space, arg2_space, phrase_space: Space objects in which the
            corresponding string lists are interpreted.

            arg1_list, arg2_list, phrase_list: equal-length lists of row
            strings; element i of each list belongs to the same data point.
        """

        # we try to achieve at most MAX_MEM_OVERHEAD*phrase_space memory overhead
        # the /3.0 is needed
        # because the train data needs 3 * len(train_data) memory (arg1 vector, arg2 vector, phrase vector)
        chunk_size = int(phrase_space.cooccurrence_matrix.shape[0] *
                         self.MAX_MEM_OVERHEAD / 3.0) + 1

        arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr = (
            0, 0, 0, 0, 0)

        # Ceiling division: the original floor division silently dropped the
        # final partial chunk (and `range(float)` is a TypeError in Python 3).
        num_chunks = (len(arg1_list) + chunk_size - 1) // chunk_size

        for i in range(num_chunks):
            beg, end = i * chunk_size, min((i + 1) * chunk_size,
                                           len(arg1_list))

            arg1_mat = arg1_space.get_rows(arg1_list[beg:end])
            arg2_mat = arg2_space.get_rows(arg2_list[beg:end])
            phrase_mat = phrase_space.get_rows(phrase_list[beg:end])

            # The per-chunk computation expects dense matrices.
            [arg1_mat, arg2_mat, phrase_mat
             ] = resolve_type_conflict([arg1_mat, arg2_mat, phrase_mat],
                                       DenseMatrix)

            # _train1 returns the chunk's five partial statistics, in order.
            res = self._train1(arg1_mat, arg2_mat, phrase_mat)
            arg1_arg2_dot += res[0]
            arg1_phrase_dot += res[1]
            arg2_phrase_dot += res[2]
            arg1_norm_sqr += res[3]
            arg2_norm_sqr += res[4]

        self._train2(arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot,
                     arg1_norm_sqr, arg2_norm_sqr)
예제 #3
0
    def project(self, matrix_):
        """
        Applies the stored dimensionality reduction to a peripheral matrix.

        Args:
            matrix_: matrix on which the reduction is projected, of type Matrix

        Returns:
            the reduced matrix

        The transformation matrix recorded when the reduction was applied to
        the core space is reused here to map a new, peripheral matrix into
        the same reduced space.
        """
        if self.__transmat is None:
            self._raise_projection_error(self.__dim_reduction)

        is_nmf = self.__dim_reduction.name == "nmf"
        # NMF requires non-negative input.
        if is_nmf:
            matrix_.assert_positive()

        if not isinstance(matrix_, type(self.__transmat)):
            warn(
                "WARNING: peripheral matrix type (dense/sparse) should be the same as the core space matrix type!!"
            )

        # Bring both operands to the peripheral matrix's representation.
        matrix_, transmat = resolve_type_conflict(
            [matrix_, self.__transmat], type(matrix_))

        result_mat = matrix_ * transmat

        # NMF output must stay non-negative.
        if is_nmf:
            result_mat.to_non_negative()

        return DenseMatrix(result_mat)
예제 #4
0
    def xxx(self, arg1_space, arg2_space, phrase_space, arg1_list, arg2_list, phrase_list):
        """
        Accumulates the training statistics (dot products and squared norms)
        over the data in memory-bounded chunks, then calls _train2 on the
        summed statistics.
        """

        # we try to achieve at most MAX_MEM_OVERHEAD*phrase_space memory overhead
        # the /3.0 is needed
        # because the train data needs 3 * len(train_data) memory (arg1 vector, arg2 vector, phrase vector)
        chunk_size = int(phrase_space.cooccurrence_matrix.shape[0] * self.MAX_MEM_OVERHEAD / 3.0) + 1

        arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr = (0, 0, 0, 0, 0)

        # Ceiling division: the original floor division dropped the final
        # partial chunk (and `range(float)` is a TypeError in Python 3).
        for i in range((len(arg1_list) + chunk_size - 1) // chunk_size):
            beg, end = i*chunk_size, min((i+1)*chunk_size, len(arg1_list))

            arg1_mat = arg1_space.get_rows(arg1_list[beg:end])
            arg2_mat = arg2_space.get_rows(arg2_list[beg:end])
            phrase_mat = phrase_space.get_rows(phrase_list[beg:end])

            # The per-chunk computation expects dense matrices.
            [arg1_mat, arg2_mat, phrase_mat] = resolve_type_conflict([arg1_mat,
                                                                      arg2_mat,
                                                                      phrase_mat],
                                                                      DenseMatrix)

            # _train1 returns the chunk's five partial statistics, in order.
            res = self._train1(arg1_mat, arg2_mat, phrase_mat)
            arg1_arg2_dot += res[0]
            arg1_phrase_dot += res[1]
            arg2_phrase_dot += res[2]
            arg1_norm_sqr += res[3]
            arg2_norm_sqr += res[4]

        self._train2(arg1_arg2_dot, arg1_phrase_dot, arg2_phrase_dot, arg1_norm_sqr, arg2_norm_sqr)
예제 #5
0
    def compose(self, data, arg_space):
        """
        Uses a composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
            strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
            elements to be composed and composed_phrase is the string associated
            to their composition.

            arg_space: argument space(s). Space object or a tuple of two
            Space objects (e.g. my_space, or (my_space1, my_space2)).
            If two spaces are provided, arg1 elements of data are
            interpreted in space1, and arg2 in space2.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        Raises:
            ValueError: if no valid data points remain after filtering.
        """
        start = time.time()

        arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(data,
                                                                     (arg1_space.row2id,
                                                                      arg2_space.row2id,
                                                                      None))

        # Fail early with a clear message: previously an empty data set
        # crashed below with a NameError (no loop variable ever assigned).
        if not arg1_list:
            raise ValueError("No valid data to compose!")

        # we try to achieve at most MAX_MEM_OVERHEAD*phrase_space memory overhead
        # the /3.0 is needed
        # because the composing data needs 3 * len(train_data) memory (arg1 vector, arg2 vector, phrase vector)
        chunk_size = int(max(arg1_space.cooccurrence_matrix.shape[0],
                             arg2_space.cooccurrence_matrix.shape[0],
                             len(phrase_list))
                         * self.MAX_MEM_OVERHEAD / 3.0) + 1

        composed_mats = []
        for i in range(int(math.ceil(len(arg1_list) / float(chunk_size)))):
            beg, end = i * chunk_size, min((i + 1) * chunk_size, len(arg1_list))

            arg1_mat = arg1_space.get_rows(arg1_list[beg:end])
            arg2_mat = arg2_space.get_rows(arg2_list[beg:end])

            # Composition expects dense matrices.
            [arg1_mat, arg2_mat] = resolve_type_conflict([arg1_mat, arg2_mat],
                                                         DenseMatrix)
            composed_mats.append(self._compose(arg1_mat, arg2_mat))

        composed_phrase_mat = composed_mats[0].nary_vstack(composed_mats)

        # Lazily build the composed space's column mapping on first use.
        if self.composed_id2column is None:
            self.composed_id2column = self._build_id2column(arg1_space, arg2_space)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        # Log the true total; the original logged only the last chunk's size.
        log.print_info(logger, 3, "Composed total data points:%s" % len(arg1_list))
        log.print_matrix_info(logger, composed_phrase_mat, 4,
                              "Resulted (composed) semantic space::")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_phrase_mat, phrase_list, self.composed_id2column)
예제 #6
0
    def compose(self, data, arg_space):
        """
        Uses a lexical function composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
            strings: (function_word, arg, composed_phrase). function_word and
            arg are the elements to be composed and composed_phrase is the
            string associated to their composition. function_word elements
            are interpreted in self.function_space.

            arg_space: argument space, of type Space. arg elements of data are
            interpreted in this space.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        """
        start = time.time()

        assert_is_instance(arg_space, Space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
            data, (self._function_space.row2id, arg_space.row2id, None))

        # Compose one data point at a time: each function-word vector is
        # combined with its argument vector.
        composed_vec_list = []
        for i in range(len(arg1_list)):
            arg1_vec = self._function_space.get_row(arg1_list[i])
            arg2_vec = arg_space.get_row(arg2_list[i])

            # Convert both vectors to the type (dense/sparse) of the larger one.
            matrix_type = get_type_of_largest([arg1_vec, arg2_vec])
            [arg1_vec, arg2_vec] = resolve_type_conflict([arg1_vec, arg2_vec],
                                                         matrix_type)

            composed_ph_vec = self._compose(arg1_vec, arg2_vec,
                                            self._function_space.element_shape)

            composed_vec_list.append(composed_ph_vec)

        # Result elements drop the last axis of the function element shape.
        result_element_shape = self._function_space.element_shape[0:-1]
        # NOTE(review): relies on the loop above running at least once —
        # empty input data would leave composed_ph_vec unbound (NameError).
        composed_ph_mat = composed_ph_vec.nary_vstack(composed_vec_list)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3,
                       "Composed total data points:%s" % len(arg1_list))
        log.print_info(
            logger, 3,
            "Functional shape of the resulted (composed) elements:%s" %
            (result_element_shape, ))
        log.print_matrix_info(logger, composed_ph_mat, 4,
                              "Resulted (composed) semantic space:")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_ph_mat,
                     phrase_list,
                     self.composed_id2column,
                     element_shape=result_element_shape)
예제 #7
0
 def _compose(self, arg1_mat, arg2_mat):
     """Combine the argument matrices with the trained weight matrices."""
     # NOTE: arg1_mat and arg2_mat already share the same matrix type here.
     mat_a_t, mat_b_t, arg1_mat = resolve_type_conflict(
         [self._mat_a_t, self._mat_b_t, arg1_mat], type(arg1_mat))

     if not self._has_intercept:
         return arg1_mat * mat_a_t + arg2_mat * mat_b_t
     # With an intercept, arg2 is padded with a constant column first.
     return arg1_mat * mat_a_t + padd_matrix(arg2_mat, 1) * mat_b_t
예제 #8
0
 def _compose(self, arg1_mat, arg2_mat):
     """
     Composes the argument matrices using the stored weight matrices
     self._mat_a_t and self._mat_b_t.
     """
     #NOTE when we get in this compose arg1 mat and arg2 mat have the same type
     # Bring the stored weights to the arguments' matrix type (dense/sparse).
     [mat_a_t, mat_b_t, arg1_mat] = resolve_type_conflict([self._mat_a_t,
                                                           self._mat_b_t,
                                                           arg1_mat],
                                                          type(arg1_mat))
     if self._has_intercept:
         # padd_matrix adds the extra column arg2 needs when the learner
         # was trained with an intercept term.
         return arg1_mat * mat_a_t + padd_matrix(arg2_mat, 1) * mat_b_t
     else:
         return arg1_mat * mat_a_t + arg2_mat * mat_b_t
예제 #9
0
    def compose(self, data, arg_space):
        """
        Uses a lexical function composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
            strings: (function_word, arg, composed_phrase). function_word and
            arg are the elements to be composed and composed_phrase is the
            string associated to their composition. function_word elements
            are interpreted in self.function_space.

            arg_space: argument space, of type Space. arg elements of data are
            interpreted in this space.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        """
        start = time.time()
        
        assert_is_instance(arg_space, Space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(data,
                                                                     (self._function_space.row2id,
                                                                      arg_space.row2id,
                                                                      None))

        # Compose one data point at a time: each function-word vector is
        # combined with its argument vector.
        composed_vec_list = []
        for i in xrange(len(arg1_list)):
            arg1_vec = self._function_space.get_row(arg1_list[i])
            arg2_vec = arg_space.get_row(arg2_list[i])
        
            # Convert both vectors to the type (dense/sparse) of the larger one.
            matrix_type = get_type_of_largest([arg1_vec, arg2_vec])
            [arg1_vec, arg2_vec] = resolve_type_conflict([arg1_vec, arg2_vec],
                                                              matrix_type)
                
            composed_ph_vec = self._compose(arg1_vec, arg2_vec,
                                            self._function_space.element_shape)

            composed_vec_list.append(composed_ph_vec)
        
        # Result elements drop the last axis of the function element shape.
        result_element_shape = self._function_space.element_shape[0:-1]
        # NOTE(review): relies on the loop above running at least once —
        # empty input data would leave composed_ph_vec unbound (NameError).
        composed_ph_mat = composed_ph_vec.nary_vstack(composed_vec_list)
        
        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3, "Composed total data points:%s" % len(arg1_list))
        log.print_info(logger, 3, "Functional shape of the resulted (composed) elements:%s" 
                       % (result_element_shape,))
        log.print_matrix_info(logger, composed_ph_mat, 4, 
                              "Resulted (composed) semantic space:")
        log.print_time_info(logger, time.time(), start, 2)
        
        return Space(composed_ph_mat, phrase_list, self.composed_id2column, 
                     element_shape = result_element_shape)
예제 #10
0
    def _train(self, arg1_space, arg2_space, phrase_space, arg1_list, arg2_list, phrase_list):
        """Fetch the training rows from each space and run the solver on them."""
        mats = [arg1_space.get_rows(arg1_list),
                arg2_space.get_rows(arg2_list),
                phrase_space.get_rows(phrase_list)]

        # The solver works on dense matrices; coerce all three at once.
        arg1_mat, arg2_mat, phrase_mat = resolve_type_conflict(mats, DenseMatrix)

        self._solve(arg1_mat, arg2_mat, phrase_mat)
예제 #11
0
    def vstack(cls, space1, space2):
        """
        Classmethod. Stacks two semantic spaces.

        The rows of space2 are appended below the rows of space1.

        Args:
            space1, space2: spaces to be stacked, of type Space

        Returns:
            Stacked space, type Space.

        Raises:
            ValueError: if the spaces have different number of columns
                        or their columns are not identical
        """
        mat1 = space1.cooccurrence_matrix
        mat2 = space2.cooccurrence_matrix

        if mat1.shape[1] != mat2.shape[1]:
            raise ValueError("Inconsistent shapes: %s, %s" %
                             (mat1.shape[1], mat2.shape[1]))

        if space1.id2column != space2.id2column:
            raise ValueError("Identical columns required")

        # Row bookkeeping: ids of space2 follow those of space1.
        new_row2id = add_items_to_dict(space1.row2id.copy(), space2.id2row)
        new_id2row = space1.id2row + space2.id2row

        # Convert both matrices to the type of the larger one before stacking.
        target_type = get_type_of_largest([mat1, mat2])
        new_mat1, new_mat2 = resolve_type_conflict([mat1, mat2], target_type)
        new_mat = new_mat1.vstack(new_mat2)

        log.print_info(logger, 1, "\nVertical stack of two spaces")
        log.print_matrix_info(logger, mat1, 2, "Semantic space 1:")
        log.print_matrix_info(logger, mat2, 2, "Semantic space 2:")
        log.print_matrix_info(logger, new_mat, 2, "Resulted semantic space:")

        # Columns come from space1 (identical to space2's by the check above);
        # the operations history of the stacked space starts out empty.
        return Space(new_mat,
                     new_id2row,
                     list(space1.id2column),
                     new_row2id,
                     space1.column2id.copy(),
                     operations=[])
예제 #12
0
    def test_resolve_type_conflict(self):
        """resolve_type_conflict converts every element to the target type,
        accepts raw numpy/scipy matrices, and returns [] for empty input."""
        arr = np.mat([1, 2])

        a = DenseMatrix(arr)
        b = SparseMatrix(arr)

        [c, d] = resolve_type_conflict([a, b], DenseMatrix)
        [e, f, g] = resolve_type_conflict([b, a, a], DenseMatrix)
        h = resolve_type_conflict([], DenseMatrix)

        # Raw numpy / scipy.sparse inputs must be wrapped too.
        [u, v] = resolve_type_conflict([arr, csr_matrix(arr)], DenseMatrix)

        self.assertIsInstance(c, DenseMatrix)
        self.assertIsInstance(d, DenseMatrix)
        self.assertIsInstance(e, DenseMatrix)
        self.assertIsInstance(f, DenseMatrix)
        self.assertIsInstance(g, DenseMatrix)
        self.assertListEqual([], h)

        # (removed an exact duplicate of the assertIsInstance(g, ...) check)

        self.assertIsInstance(u, DenseMatrix)
        self.assertIsInstance(v, DenseMatrix)
예제 #13
0
 def vstack(cls, space1, space2):
     """
     Classmethod. Stacks two semantic spaces.

     The rows in the two spaces are concatenated.

     Args:
         space1, space2: spaces to be stacked, of type Space

     Returns:
         Stacked space, type Space.

     Raises:
         ValueError: if the spaces have different number of columns
                     or their columns are not identical

     """
     # Both spaces must describe the same columns for row stacking to be valid.
     if space1.cooccurrence_matrix.shape[1] != space2.cooccurrence_matrix.shape[1]:
         raise ValueError("Inconsistent shapes: %s, %s" 
                          % (space1.cooccurrence_matrix.shape[1], 
                             space2.cooccurrence_matrix.shape[1]))
     
     if space1.id2column != space2.id2column:
         raise ValueError("Identical columns required")
     
     # Row ids of space2 are appended after those of space1.
     new_row2id = add_items_to_dict(space1.row2id.copy(), space2.id2row)
     new_id2row = space1.id2row + space2.id2row
     
     # Convert both matrices to the type (dense/sparse) of the larger one
     # before stacking them vertically.
     matrix_type = get_type_of_largest([space1.cooccurrence_matrix,
                                        space2.cooccurrence_matrix])
     [new_mat1, new_mat2] = resolve_type_conflict([space1.cooccurrence_matrix, 
                                                   space2.cooccurrence_matrix],
                                                  matrix_type)
     
     new_mat = new_mat1.vstack(new_mat2)
     
     log.print_info(logger, 1, "\nVertical stack of two spaces")
     log.print_matrix_info(logger, space1.cooccurrence_matrix, 2, 
                           "Semantic space 1:")
     log.print_matrix_info(logger, space2.cooccurrence_matrix, 2, 
                           "Semantic space 2:")
     log.print_matrix_info(logger, new_mat, 2, "Resulted semantic space:")
     
     # Columns come from space1 (identical to space2's by the check above);
     # the new space starts with an empty operations history.
     return Space(new_mat, new_id2row, list(space1.id2column), new_row2id, 
                  space1.column2id.copy(), operations=[])
예제 #14
0
    def compose(self, data, arg_space):
        """
        Uses a composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
            strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
            elements to be composed and composed_phrase is the string
            associated to their composition.

            arg_space: argument space(s). Space object or a tuple of two
            Space objects (e.g. my_space, or (my_space1, my_space2)).
            If two spaces are provided, arg1 elements of data are
            interpreted in space1, and arg2 in space2.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.
        """
        start = time.time()

        arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
        row_maps = (arg1_space.row2id, arg2_space.row2id, None)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(data, row_maps)

        # Fetch the argument vectors and make both operands dense.
        arg1_mat, arg2_mat = resolve_type_conflict(
            [arg1_space.get_rows(arg1_list), arg2_space.get_rows(arg2_list)],
            DenseMatrix)

        composed_phrase_mat = self._compose(arg1_mat, arg2_mat)

        # Lazily build the composed space's column mapping on first use.
        if self.composed_id2column is None:
            self.composed_id2column = self._build_id2column(arg1_space, arg2_space)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3,
                       "Composed total data points:%s" % arg1_mat.shape[0])
        log.print_matrix_info(logger, composed_phrase_mat, 4,
                              "Resulted (composed) semantic space::")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_phrase_mat, phrase_list, self.composed_id2column)
예제 #15
0
    def compose(self, data, arg_space):
        """
        Uses a composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
            strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
            elements to be composed and composed_phrase is the string associated
            to their composition.

            arg_space: argument space(s). Space object or a tuple of two
            Space objects (e.g. my_space, or (my_space1, my_space2)).
            If two spaces are provided, arg1 elements of data are
            interpreted in space1, and arg2 in space2.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        """
        start = time.time()
         
        arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(data,
                                                                     (arg1_space.row2id,
                                                                      arg2_space.row2id,
                                                                      None))
                                                                     
        arg1_mat = arg1_space.get_rows(arg1_list)
        arg2_mat = arg2_space.get_rows(arg2_list)
        
        # Coerce both argument matrices to DenseMatrix before composing.
        [arg1_mat, arg2_mat] = resolve_type_conflict([arg1_mat, arg2_mat], DenseMatrix) 
        
        composed_phrase_mat = self._compose(arg1_mat, arg2_mat)
        # Lazily build the composed space's column mapping on first use.
        if self.composed_id2column is None:
            self.composed_id2column = self._build_id2column(arg1_space, arg2_space)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3, "Composed total data points:%s" % arg1_mat.shape[0])
        log.print_matrix_info(logger, composed_phrase_mat, 4, 
                              "Resulted (composed) semantic space::")
        log.print_time_info(logger, time.time(), start, 2)
                
        return Space(composed_phrase_mat, phrase_list, self.composed_id2column)
예제 #16
0
    def get_sim(self, word1, word2, similarity, space2=None):
        """
        Computes the similarity between two targets in the semantic
        space.

        If one of the two targets to be compared is not found, it returns 0..

        Args:
            word1: string
            word2: string
            similarity: of type Similarity, the similarity measure to be used
            space2: Space type, Optional. If provided, word2 is interpreted in
                this space, rather than the current space. Default, both words
                are interpreted in the current space.
        Returns:
            scalar, similarity score

        """
        assert_is_instance(similarity, Similarity)

        # Missing rows are reported and treated as zero similarity.
        try:
            v1 = self.get_row(word1)
        except KeyError:
            print("Row string %s not found, returning 0.0" % (word1))
            return 0.0

        # word2 may be looked up in a different space than word1.
        word2_space = self if space2 is None else space2
        try:
            v2 = word2_space.get_row(word2)
        except KeyError:
            print("Row string %s not found, returning 0.0" % (word2))
            return 0.0

        # Coerce both vectors to DenseMatrix before measuring similarity.
        v1, v2 = resolve_type_conflict([v1, v2], DenseMatrix)
        return similarity.get_sim(v1, v2)
예제 #17
0
    def get_sim(self, word1, word2, similarity, space2=None):
        """
        Computes the similarity between two targets in the semantic
        space.

        If one of the two targets to be compared is not found, it returns 0..

        Args:
            word1: string
            word2: string
            similarity: of type Similarity, the similarity measure to be used
            space2: Space type, Optional. If provided, word2 is interpreted in
                this space, rather than the current space. Default, both words
                are interpreted in the current space.
        Returns:
            scalar, similarity score

        """
        
        assert_is_instance(similarity, Similarity)
        
        # Missing rows are reported and treated as zero similarity
        # rather than raising to the caller.
        try:
            v1 = self.get_row(word1)
        except KeyError:
            print "Row string %s not found, returning 0.0" % (word1)
            return 0.0
        try:
            # word2 may be looked up in a different space than word1.
            if space2 is None:
                v2 = self.get_row(word2)
            else:
                v2 = space2.get_row(word2)
        except KeyError:
            print "Row string %s not found, returning 0.0" % (word2)
            return 0.0
                
        # Coerce both vectors to DenseMatrix before measuring similarity.
        [v1, v2] = resolve_type_conflict([v1, v2], DenseMatrix)
        return similarity.get_sim(v1, v2)
예제 #18
0
    def train(self, train_data, arg_space, phrase_space):
        """
        Trains a lexical function composition model to learn a function
        space and sets the function_space parameter.

        Args:
            train_data: list of string tuples. Each tuple contains 3
            string elements: (function_word, arg, phrase).

            arg_space: argument space, of type Space. arg elements of
            train data are interpreted in this space.

            phrase space: phrase space, of type Space. phrase elements of
            the train data are interpreted in this space.

        Training tuples which contain strings not found in their
        respective spaces are ignored. Function words containing less than
        _MIN_SAMPLES training instances are ignored. For example, if
        _MIN_SAMPLES=2 and function word "red" occurs in only one phrase, "red"
        is ignored.

        The id2column attribute of the resulted composed space is set to
        be equal to that of the phrase space given as an input.
        """
        
        start = time.time()

        self._has_intercept = self._regression_learner.has_intercept()

        if not isinstance(arg_space, Space):
            raise ValueError("expected one input spaces!")  
               
        result_mats = []
               
        # Sorting by function word groups all samples of the same word
        # together, which get_partitions below relies on.
        train_data = sorted(train_data, key=lambda tup: tup[0])
        function_word_list, arg_list, phrase_list = self.valid_data_to_lists(train_data,
                                                                             (None,
                                                                              arg_space.row2id,
                                                                              phrase_space.row2id))
        #partitions the sorted input data
        keys, key_ranges = get_partitions(function_word_list, self._MIN_SAMPLES)
        
        if not keys:
            raise ValueError("No valid training data found!")
                
        # Only 1-d argument elements are supported here.
        assert(len(arg_space.element_shape) == 1)
        
        # Learned functions get one extra input dimension when the
        # regression learner uses an intercept term.
        if self._has_intercept:
            new_element_shape = phrase_space.element_shape + (arg_space.element_shape[0] + 1,)
        else:
            new_element_shape = phrase_space.element_shape + (arg_space.element_shape[0],)
            
        # Train one regression per function word, over its sample range.
        for i in xrange(len(key_ranges)):
            
            idx_beg, idx_end = key_ranges[i]
            
            print ("Training lexical function...%s with %d samples" 
                     % (keys[i], idx_end - idx_beg))
                            
            arg_mat = arg_space.get_rows(arg_list[idx_beg:idx_end]) 
            phrase_mat = phrase_space.get_rows(phrase_list[idx_beg:idx_end])
 
            #convert them to the same type
            matrix_type = get_type_of_largest([arg_mat, phrase_mat])
            [arg_mat, phrase_mat] = resolve_type_conflict([arg_mat, phrase_mat],
                                                          matrix_type)

            result_mat = self._regression_learner.train(arg_mat, phrase_mat).transpose()
            
            # Flatten the learned function into a single row before stacking.
            # NOTE(review): assumes Matrix.reshape mutates in place — the
            # return value is discarded; confirm against the Matrix API.
            result_mat.reshape((1, np.prod(new_element_shape)))
            
            result_mats.append(result_mat)

        # keys is non-empty (checked above), so arg_mat is bound here.
        new_space_mat = arg_mat.nary_vstack(result_mats)
        
        self.composed_id2column = phrase_space.id2column
            
        self._function_space = Space(new_space_mat, keys, [], 
                                     element_shape=new_element_shape)
        
        log.print_composition_model_info(logger, self, 1, "\nTrained composition model:")
        log.print_info(logger, 3, "Trained: %s lexical functions" % len(keys))
        log.print_info(logger, 3, "With total data points:%s" % len(function_word_list))
        log.print_matrix_info(logger, arg_space.cooccurrence_matrix, 3, 
                              "Semantic space of arguments:")
        log.print_info(logger, 3, "Shape of lexical functions learned:%s" 
                       % (new_element_shape,))
        log.print_matrix_info(logger, new_space_mat, 3, 
                              "Semantic space of lexical functions:")
        log.print_time_info(logger, time.time(), start, 2)
예제 #19
0
    def train(self, train_data, arg_space, phrase_space):
        """
        Trains a lexical function composition model to learn a function
        space and sets the function_space parameter.

        Args:
            train_data: list of string tuples. Each tuple contains 3
            string elements: (function_word, arg, phrase).

            arg_space: argument space, of type Space. arg elements of
            train data are interpreted in this space.

            phrase space: phrase space, of type Space. phrase elements of
            the train data are interpreted in this space.

        Training tuples which contain strings not found in their
        respective spaces are ignored. Function words containing less than
        _MIN_SAMPLES training instances are ignored. For example, if
        _MIN_SAMPLES=2 and function word "red" occurs in only one phrase, "red"
        is ignored.

        The id2column attribute of the resulted composed space is set to
        be equal to that of the phrase space given as an input.
        """

        start = time.time()

        self._has_intercept = self._regression_learner.has_intercept()

        if not isinstance(arg_space, Space):
            raise ValueError("expected one input spaces!")

        result_mats = []

        # Sorting by function word groups all samples of the same word
        # together, which get_partitions below relies on.
        train_data = sorted(train_data, key=lambda tup: tup[0])
        function_word_list, arg_list, phrase_list = self.valid_data_to_lists(
            train_data, (None, arg_space.row2id, phrase_space.row2id))
        #partitions the sorted input data
        keys, key_ranges = get_partitions(function_word_list,
                                          self._MIN_SAMPLES)

        if not keys:
            raise ValueError("No valid training data found!")

        # Only 1-d argument elements are supported here.
        assert (len(arg_space.element_shape) == 1)

        # Learned functions get one extra input dimension when the
        # regression learner uses an intercept term.
        if self._has_intercept:
            new_element_shape = phrase_space.element_shape + (
                arg_space.element_shape[0] + 1, )
        else:
            new_element_shape = phrase_space.element_shape + (
                arg_space.element_shape[0], )

        # Train one regression per function word, over its sample range.
        for i in range(len(key_ranges)):
            idx_beg, idx_end = key_ranges[i]

            print(("Training lexical function...%s with %d samples" %
                   (keys[i], idx_end - idx_beg)))

            arg_mat = arg_space.get_rows(arg_list[idx_beg:idx_end])
            phrase_mat = phrase_space.get_rows(phrase_list[idx_beg:idx_end])

            #convert them to the same type
            matrix_type = get_type_of_largest([arg_mat, phrase_mat])
            [arg_mat,
             phrase_mat] = resolve_type_conflict([arg_mat, phrase_mat],
                                                 matrix_type)

            result_mat = self._regression_learner.train(
                arg_mat, phrase_mat).transpose()

            # Flatten the learned function into a single row before stacking.
            # NOTE(review): assumes Matrix.reshape mutates in place — the
            # return value is discarded; confirm against the Matrix API.
            result_mat.reshape((1, np.prod(new_element_shape)))

            result_mats.append(result_mat)

        # keys is non-empty (checked above), so arg_mat is bound here.
        new_space_mat = arg_mat.nary_vstack(result_mats)

        self.composed_id2column = phrase_space.id2column

        self._function_space = Space(new_space_mat,
                                     keys, [],
                                     element_shape=new_element_shape)

        log.print_composition_model_info(logger, self, 1,
                                         "\nTrained composition model:")
        log.print_info(logger, 3, "Trained: %s lexical functions" % len(keys))
        log.print_info(logger, 3,
                       "With total data points:%s" % len(function_word_list))
        log.print_matrix_info(logger, arg_space.cooccurrence_matrix, 3,
                              "Semantic space of arguments:")
        log.print_info(
            logger, 3,
            "Shape of lexical functions learned:%s" % (new_element_shape, ))
        log.print_matrix_info(logger, new_space_mat, 3,
                              "Semantic space of lexical functions:")
        log.print_time_info(logger, time.time(), start, 2)