示例#1
0
    def apply(self, transformation):
        """
        Builds a new space by applying a transformation to this one.

        The transformation is turned into an operation, which is run on
        the data matrix of this space. Row indexing structures are copied
        over. Column indexing structures are emptied for dimensionality
        reduction operations, restricted to the selected columns for
        feature selection operations, and copied over for any other
        operation. The new space receives a copy of this space's list of
        operations with the new operation added at the end.

        Args:
            transformation: of type Scaling, DimensionalityReduction or
              FeatureSelection

        Returns:
            A new Space on which the transformation has been applied.

        """
        start = time.time()
        # TODO (kept from the original): FeatureSelection, DimReduction ..
        accepted_types = (Scaling, DimensionalityReduction, FeatureSelection)
        assert_is_instance(transformation, accepted_types)

        op = transformation.create_operation()
        new_matrix = op.apply(self.cooccurrence_matrix)
        new_operations = list(self.operations) + [op]

        id2row = list(self.id2row)
        row2id = self.row2id.copy()

        if isinstance(op, DimensionalityReductionOperation):
            self.assert_1dim_element()
            # The column labels are dropped after a dimensionality reduction.
            id2column, column2id = [], {}
        elif isinstance(op, FeatureSelectionOperation):
            self.assert_1dim_element()
            op.original_columns = self.id2column

            # Stays empty when there were no column labels to select from.
            id2column, column2id = [], {}
            if op.original_columns:
                kept = array(op.original_columns)[op.selected_columns]
                id2column = list(kept)
                column2id = list2dict(id2column)
        else:
            id2column = list(self.id2column)
            column2id = self.column2id.copy()

        log.print_transformation_info(
            logger, transformation, 1, "\nApplied transformation:")
        log.print_matrix_info(
            logger, self.cooccurrence_matrix, 2, "Original semantic space:")
        log.print_matrix_info(
            logger, new_matrix, 2, "Resulted semantic space:")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(new_matrix, id2row, id2column, row2id, column2id,
                     operations=new_operations)
示例#2
0
def load(file_name, data_type=None):
    """
    Unpickles and returns the object stored in a file.

    Args:
        file_name: path of the pickle file, opened in binary mode.
        data_type: Optional. If given, the loaded object is checked
            against it with assert_is_instance.

    Returns:
        The unpickled object.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use
    # this on files from a trusted source.
    with open(file_name, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)

    if data_type is not None:
        assert_is_instance(loaded, data_type)

    return loaded
示例#3
0
def load(file_name, data_type=None):
    """
    Unpickles and returns the object stored in a file.

    Args:
        file_name: path of the pickle file.
        data_type: Optional. If given, the loaded object is checked
            against it with assert_is_instance.

    Returns:
        The unpickled object.
    """
    # pickle.load needs a binary stream: a text-mode handle fails on
    # Python 3 and can corrupt the data on platforms that translate
    # line endings.
    # NOTE: pickle can execute arbitrary code while loading; only use
    # this on files from a trusted source.
    with open(file_name, 'rb') as f:
        result = pickle.load(f)

    if data_type is not None:
        assert_is_instance(result, data_type)

    return result
    def compose(self, data, arg_space):
        """
        Uses a lexical function composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
            strings: (function_word, arg, composed_phrase). function_word and
            arg are the elements to be composed and composed_phrase is the
            string associated to their composition. function_word elements
            are interpreted in self.function_space.

            arg_space: argument space, of type Space. arg elements of data are
            interpreted in this space.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        Raises:
            ValueError: if valid_data_to_lists yields no data points, in
            which case there are no vectors to stack into a matrix.

        """
        start = time.time()

        assert_is_instance(arg_space, Space)
        function_space = self._function_space
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
            data, (function_space.row2id, arg_space.row2id, None))

        # Without this guard an empty list surfaced further down as an
        # UnboundLocalError on the loop variable.
        if not arg1_list:
            raise ValueError("No valid data points to compose.")

        composed_vec_list = []
        for i, function_word in enumerate(arg1_list):
            arg1_vec = function_space.get_row(function_word)
            arg2_vec = arg_space.get_row(arg2_list[i])

            # presumably converts both vectors to the type picked by
            # get_type_of_largest -- confirm against the helpers
            matrix_type = get_type_of_largest([arg1_vec, arg2_vec])
            [arg1_vec, arg2_vec] = resolve_type_conflict([arg1_vec, arg2_vec],
                                                         matrix_type)

            composed_vec_list.append(
                self._compose(arg1_vec, arg2_vec,
                              function_space.element_shape))

        # The composed elements lose the last dimension of the function
        # space's element shape.
        result_element_shape = function_space.element_shape[0:-1]
        # As before, nary_vstack is invoked on the last composed vector.
        composed_ph_mat = composed_vec_list[-1].nary_vstack(composed_vec_list)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3,
                       "Composed total data points:%s" % len(arg1_list))
        log.print_info(
            logger, 3,
            "Functional shape of the resulted (composed) elements:%s" %
            (result_element_shape, ))
        log.print_matrix_info(logger, composed_ph_mat, 4,
                              "Resulted (composed) semantic space:")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_ph_mat,
                     phrase_list,
                     self.composed_id2column,
                     element_shape=result_element_shape)
示例#5
0
    def apply(self, transformation):
        """
        Returns the space obtained by applying a transformation to this one.

        An operation is created from the transformation and run on the
        data matrix. The row indexing structures are copied. The column
        indexing structures depend on the kind of operation: they are
        emptied for a dimensionality reduction operation, narrowed to the
        selected columns for a feature selection operation, and copied
        otherwise. The operation is added at the end of a copy of the
        list of operations held by this space.

        Args:
            transformation: of type Scaling, DimensionalityReduction or
              FeatureSelection

        Returns:
            A new space on which the transformation has been applied.

        """
        start = time.time()
        # TODO (kept from the original): FeatureSelection, DimReduction ..
        assert_is_instance(
            transformation,
            (Scaling, DimensionalityReduction, FeatureSelection))

        op = transformation.create_operation()
        new_matrix = op.apply(self.cooccurrence_matrix)

        new_operations = list(self.operations)
        new_operations.append(op)

        id2row = list(self.id2row)
        row2id = self.row2id.copy()

        reduces_dims = isinstance(op, DimensionalityReductionOperation)
        selects_features = (not reduces_dims
                            and isinstance(op, FeatureSelectionOperation))

        if reduces_dims or selects_features:
            self.assert_1dim_element()
            # Default for both cases: no column labels.
            id2column, column2id = [], {}
            if selects_features:
                op.original_columns = self.id2column
                if op.original_columns:
                    selected = array(op.original_columns)[op.selected_columns]
                    id2column = list(selected)
                    column2id = list2dict(id2column)
        else:
            id2column = list(self.id2column)
            column2id = self.column2id.copy()

        log.print_transformation_info(logger, transformation, 1,
                                      "\nApplied transformation:")
        log.print_matrix_info(logger, self.cooccurrence_matrix, 2,
                              "Original semantic space:")
        log.print_matrix_info(logger, new_matrix, 2,
                              "Resulted semantic space:")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(new_matrix, id2row, id2column,
                     row2id, column2id, operations=new_operations)
    def compose(self, data, arg_space):
        """
        Uses a lexical function composition model to compose elements.

        Args:
            data: data to be composed. List of tuples, each containing 3
            strings: (function_word, arg, composed_phrase). function_word and
            arg are the elements to be composed and composed_phrase is the
            string associated to their composition. function_word elements
            are interpreted in self.function_space.

            arg_space: argument space, of type Space. arg elements of data are
            interpreted in this space.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        Raises:
            ValueError: if valid_data_to_lists yields no data points, in
            which case there are no vectors to stack into a matrix.

        """
        start = time.time()

        assert_is_instance(arg_space, Space)
        function_space = self._function_space
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
            data, (function_space.row2id, arg_space.row2id, None))

        # Without this guard an empty list surfaced further down as an
        # UnboundLocalError on the loop variable.
        if not arg1_list:
            raise ValueError("No valid data points to compose.")

        composed_vec_list = []
        # enumerate replaces xrange, which does not exist on Python 3.
        for i, function_word in enumerate(arg1_list):
            arg1_vec = function_space.get_row(function_word)
            arg2_vec = arg_space.get_row(arg2_list[i])

            # presumably converts both vectors to the type picked by
            # get_type_of_largest -- confirm against the helpers
            matrix_type = get_type_of_largest([arg1_vec, arg2_vec])
            [arg1_vec, arg2_vec] = resolve_type_conflict([arg1_vec, arg2_vec],
                                                         matrix_type)

            composed_vec_list.append(
                self._compose(arg1_vec, arg2_vec,
                              function_space.element_shape))

        # The composed elements lose the last dimension of the function
        # space's element shape.
        result_element_shape = function_space.element_shape[0:-1]
        # As before, nary_vstack is invoked on the last composed vector.
        composed_ph_mat = composed_vec_list[-1].nary_vstack(composed_vec_list)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3,
                       "Composed total data points:%s" % len(arg1_list))
        log.print_info(
            logger, 3,
            "Functional shape of the resulted (composed) elements:%s"
            % (result_element_shape,))
        log.print_matrix_info(logger, composed_ph_mat, 4,
                              "Resulted (composed) semantic space:")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_ph_mat, phrase_list, self.composed_id2column,
                     element_shape=result_element_shape)
示例#7
0
    def get_neighbours(self, word, no_neighbours, similarity,
                       space2=None):
        """
        Retrieves the neighbours of a word in the semantic space.

        Args:
            word: string, target word
            no_neighbours: int, the number of neighbours desired; capped
                at the number of rows of the space that is searched
            similarity: of type Similarity, the similarity measure to be used
            space2: Space type, Optional. If provided, the neighbours are
                retrieved from this space, rather than the current space.
                Default, neighbours are retrieved from the current space.

        Returns:
            list of (neighbour_string, similarity_value) tuples, in the
            order given by the sorted permutation of the similarities.

        Raises:
            KeyError: if the word is not found in the semantic space.

        """
        start = time.time()
        assert_is_instance(similarity, Similarity)
        vector = self.get_row(word)

        if space2 is not None:
            searched_matrix = space2.cooccurrence_matrix
            # The vector is converted when its type differs from the
            # type of the matrix it is compared against.
            mat_type = type(searched_matrix)
            if not isinstance(vector, mat_type):
                vector = mat_type(vector)
            id2row = space2.id2row
        else:
            searched_matrix = self.cooccurrence_matrix
            id2row = self.id2row

        sims_to_matrix = similarity.get_sims_to_matrix(vector,
                                                       searched_matrix)

        sorted_perm = sims_to_matrix.sorted_permutation(sims_to_matrix.sum, 1)
        no_neighbours = min(no_neighbours, len(id2row))
        ranked_ids = (sorted_perm[rank] for rank in range(no_neighbours))
        result = [(id2row[i], sims_to_matrix[i, 0]) for i in ranked_ids]

        log.print_info(logger, 1, "\nGetting neighbours of:%s" % (word))
        log.print_name(logger, similarity, 1, "Similarity:")
        log.print_time_info(logger, time.time(), start, 2)
        return result
示例#8
0
    def get_neighbours(self, word, no_neighbours, similarity, space2=None):
        """
        Looks up the neighbours of a word in the semantic space.

        Args:
            word: string, target word
            no_neighbours: int, the number of neighbours desired; never
                more than the number of rows of the searched space are
                returned
            similarity: of type Similarity, the similarity measure to be used
            space2: Space type, Optional. If provided, the neighbours are
                retrieved from this space, rather than the current space.
                Default, neighbours are retrieved from the current space.

        Returns:
            list of (neighbour_string, similarity_value) tuples, following
            the sorted permutation of the computed similarities.

        Raises:
            KeyError: if the word is not found in the semantic space.

        """
        start = time.time()
        assert_is_instance(similarity, Similarity)
        vector = self.get_row(word)

        if space2 is None:
            neighbour_space = self
        else:
            neighbour_space = space2
            # Match the vector's type to the other space's matrix type.
            mat_type = type(space2.cooccurrence_matrix)
            if not isinstance(vector, mat_type):
                vector = mat_type(vector)

        sims_to_matrix = similarity.get_sims_to_matrix(
            vector, neighbour_space.cooccurrence_matrix)
        id2row = neighbour_space.id2row

        sorted_perm = sims_to_matrix.sorted_permutation(sims_to_matrix.sum, 1)
        no_neighbours = min(no_neighbours, len(id2row))

        result = []
        for rank in range(no_neighbours):
            row_id = sorted_perm[rank]
            neighbour = (id2row[row_id], sims_to_matrix[row_id, 0])
            result.append(neighbour)

        log.print_info(logger, 1, "\nGetting neighbours of:%s" % (word))
        log.print_name(logger, similarity, 1, "Similarity:")
        log.print_time_info(logger, time.time(), start, 2)
        return result
示例#9
0
    def get_rows(self, words):
        """
        Extracts the rows of the data matrix that belong to a list of words.

        Args:
            words: list of strings

        Returns: Matrix type (of shape (len(words), no_cols)),
                 the sub-matrix containing the words given as an input,
                 in the order in which they were given.

        Raises:
            KeyError: if one of words is not found in the space
        """
        assert_is_instance(words, list)
        # Translate every word into its row index; unknown words raise.
        row_ids = [self.row2id[word] for word in words]
        return self.cooccurrence_matrix[row_ids, :]
示例#10
0
    def get_rows(self, words):
        """
        Selects the sub-matrix made of the rows of the given words.

        Args:
            words: list of strings

        Returns: Matrix type (of shape (len(words), no_cols)),
                 the sub-matrix containing the words given as an input,
                 one row per word and in the same order.

        Raises:
            KeyError: if one of words is not found in the space
        """
        assert_is_instance(words, list)
        word_to_row = self.row2id
        # A word missing from the mapping raises KeyError here.
        selected_rows = [word_to_row[w] for w in words]
        return self.cooccurrence_matrix[selected_rows, :]
示例#11
0
    def extract_arg_spaces(cls, arg_space):
        """
        Splits an argument space specification into two spaces.

        TO BE MOVED TO A UTILS MODULE!

        Args:
            arg_space: either a single Space, used for both arguments,
                or a tuple of exactly two Space objects.

        Returns:
            The pair (arg1_space, arg2_space).

        Raises:
            ValueError: if arg_space is a tuple whose length is not 2.
        """
        if isinstance(arg_space, tuple):
            if len(arg_space) != 2:
                raise ValueError("expected two spaces, received %d-ary tuple "
                                 % len(arg_space))
            arg1_space, arg2_space = arg_space
        else:
            # One space given: it serves as both argument spaces.
            arg1_space = arg2_space = arg_space

        for space in (arg1_space, arg2_space):
            assert_is_instance(space, Space)

        cls._assert_space_match(arg1_space, arg2_space)

        return arg1_space, arg2_space
示例#12
0
    def get_sim(self, word1, word2, similarity, space2=None):
        """
        Returns the similarity of two targets of the semantic space.

        When either target cannot be found, a message is printed and 0.0
        is returned instead of raising.

        Args:
            word1: string
            word2: string
            similarity: of type Similarity, the similarity measure to be used
            space2: Space type, Optional. If provided, word2 is interpreted in
                this space, rather than the current space. Default, both words
                are interpreted in the current space.
        Returns:
            scalar, similarity score

        """
        assert_is_instance(similarity, Similarity)

        try:
            v1 = self.get_row(word1)
        except KeyError:
            print("Row string %s not found, returning 0.0" % (word1))
            return 0.0

        # word2 is looked up in space2 when one is given.
        word2_space = self if space2 is None else space2
        try:
            v2 = word2_space.get_row(word2)
        except KeyError:
            print("Row string %s not found, returning 0.0" % (word2))
            return 0.0

        v1, v2 = resolve_type_conflict([v1, v2], DenseMatrix)
        return similarity.get_sim(v1, v2)
示例#13
0
    def get_sim(self, word1, word2, similarity, space2=None):
        """
        Computes the similarity between two targets in the semantic
        space.

        If one of the two targets to be compared is not found, a message
        is printed and 0.0 is returned.

        Args:
            word1: string
            word2: string
            similarity: of type Similarity, the similarity measure to be used
            space2: Space type, Optional. If provided, word2 is interpreted in
                this space, rather than the current space. Default, both words
                are interpreted in the current space.
        Returns:
            scalar, similarity score

        """

        assert_is_instance(similarity, Similarity)

        # print is written in call form: the statement form is a syntax
        # error on Python 3, while this spelling prints the same text on
        # both Python 2 and Python 3.
        try:
            v1 = self.get_row(word1)
        except KeyError:
            print("Row string %s not found, returning 0.0" % (word1))
            return 0.0
        try:
            if space2 is None:
                v2 = self.get_row(word2)
            else:
                v2 = space2.get_row(word2)
        except KeyError:
            print("Row string %s not found, returning 0.0" % (word2))
            return 0.0

        [v1, v2] = resolve_type_conflict([v1, v2], DenseMatrix)
        return similarity.get_sim(v1, v2)
示例#14
0
 def __init__(self, core_space, matrix_, id2row, row2id=None):
     """
     Builds a peripheral semantic space on top of a core space.

     The list of operations of the core space is copied, its column
     indexing structures are reused, and the data matrix stored is the
     result of self._project_core_operations on matrix_.

     Args:
         core_space: Space type, the core space that this is peripheral to.
         matrix_: Matrix type, the data matrix of the space
         id2row: list, the row elements
         row2id: dictionary, maps row strings to ids. Optional, built from
             id2row by default.

     Raises (per the original documentation; raised by the assert_*
     helpers, which are defined elsewhere):
         TypeError: if matrix_ or core_space are not of the correct type
         ValueError: if the matrix and the row and column indexing
             structures are not of consistent shapes.
     """
     assert_is_instance(matrix_, Matrix)
     assert_is_instance(core_space, Space)
     assert_is_instance(id2row, list)
     # TODO: assert it is not a peripheral space here!

     if row2id is not None:
         assert_dict_match_list(row2id, id2row)
     else:
         row2id = list2dict(id2row)

     # Column structures come from the core space (same objects, no copy).
     self._column2id = core_space.column2id
     self._id2column = core_space.id2column
     self._operations = list(core_space.operations)
     self._row2id, self._id2row = row2id, id2row

     # Runs after the attributes above are set, as in the original order.
     self._cooccurrence_matrix = self._project_core_operations(matrix_)
     assert_shape_consistent(self.cooccurrence_matrix,
                             self._id2row, self._id2column,
                             self._row2id, self._column2id)

     # Row elements are one-dimensional: one entry per matrix column.
     no_cols = self._cooccurrence_matrix.shape[1]
     self._element_shape = (no_cols,)
    def __init__(self, core_space, matrix_, id2row, row2id=None):
        """
        Creates a peripheral semantic space attached to a core space.

        The operations of the core space are copied into this space and
        its column indexing structures are taken over. The stored data
        matrix is what self._project_core_operations returns for matrix_.

        Args:
            core_space: Space type, the core space that this is peripheral to.
            matrix_: Matrix type, the data matrix of the space
            id2row: list, the row elements
            row2id: dictionary, maps row strings to ids. Optional, built from
                id2row by default.

        Raises (per the original documentation; raised by the assert_*
        helpers, which are defined elsewhere):
            TypeError: if matrix_ or core_space are not of the correct type
            ValueError: if the matrix and the row and column indexing
                structures are not of consistent shapes.
        """
        assert_is_instance(matrix_, Matrix)
        assert_is_instance(core_space, Space)
        assert_is_instance(id2row, list)
        # TODO: assert it is not a peripheral space here!

        if row2id is not None:
            assert_dict_match_list(row2id, id2row)
        else:
            row2id = list2dict(id2row)

        # The core space's column structures are shared, not copied.
        core_column2id = core_space.column2id
        core_id2column = core_space.id2column

        self._operations = list(core_space.operations)
        self._id2row = id2row
        self._row2id = row2id
        self._id2column = core_id2column
        self._column2id = core_column2id

        # Projection happens once the attributes above are in place.
        projected = self._project_core_operations(matrix_)
        self._cooccurrence_matrix = projected
        assert_shape_consistent(self.cooccurrence_matrix, self._id2row,
                                self._id2column, self._row2id,
                                self._column2id)

        # One-dimensional row elements: one entry per matrix column.
        self._element_shape = (self._cooccurrence_matrix.shape[1],)
示例#16
0
 def set_regression_learner(self, regression_learner):
     """
     Stores the regression learner to be used by this object.

     Args:
         regression_learner: of type RegressionLearner; checked with
             assert_is_instance before it is stored.
     """
     assert_is_instance(regression_learner, RegressionLearner)
     self._regression_learner = regression_learner
示例#17
0
 def set_regression_learner(self, regression_learner):
     """
     Replaces the regression learner held by this object.

     Args:
         regression_learner: of type RegressionLearner; the type is
             checked with assert_is_instance before the assignment.
     """
     assert_is_instance(regression_learner, RegressionLearner)
     self._regression_learner = regression_learner
示例#18
0
 def set_cooccurrence_matrix(self, matrix_):
     """
     Replaces the data matrix of the space.

     Args:
         matrix_: Matrix type, the new data matrix. It is checked for
             shape consistency against the current row and column
             indexing structures before it is stored.
     """
     assert_is_instance(matrix_, Matrix)
     # Argument order follows the constructors' calls:
     # (matrix, id2row, id2column, row2id, column2id). The previous order
     # paired row structures with column structures.
     assert_shape_consistent(matrix_, self.id2row, self.id2column,
                             self.row2id, self.column2id)
     self._cooccurrence_matrix = matrix_
示例#19
0
    def __init__(self, matrix_, id2row, id2column, row2id=None, column2id=None,
                 **kwargs):
        """
        Constructor.

        Args:
            matrix_: Matrix type, the data matrix of the space
            id2row: list, the row elements
            id2column: list, the column elements
            row2id: dictionary, maps row strings to ids. Optional, built from
                id2row by default.
            column2id: dictionary, maps col strings to ids. Optional, built
                from id2column by default
            operations: list of operations already performed on the input
                matrix, Optional, by default set to empty.
            element_shape: tuple of int, the shape on row elements. Optional,
                by default row elements are one-dimensional and element_shape
                is (no_cols, ). Used in 3D composition.

        Raises:
            TypeError: if matrix_ is not of the correct type (per the
                original documentation; raised by assert_is_instance,
                defined elsewhere)
            ValueError: if element shape is not consistent with
                the size of matrix rows
                if the matrix and the provided row and column
                indexing structures are not of consistent shapes.

        """
        assert_is_instance(matrix_, Matrix)
        assert_valid_kwargs(kwargs, ["operations", "element_shape"])
        assert_is_instance(id2row, list)
        assert_is_instance(id2column, list)

        if row2id is None:
            row2id = list2dict(id2row)
        else:
            assert_dict_match_list(row2id, id2row)

        if column2id is None:
            column2id = list2dict(id2column)
        else:
            assert_dict_match_list(column2id, id2column)

        assert_shape_consistent(matrix_, id2row, id2column, row2id, column2id)

        self._cooccurrence_matrix = matrix_
        self._row2id = row2id
        self._id2row = id2row
        self._column2id = column2id
        self._id2column = id2column
        # A fresh list is created whenever no operations are passed in.
        self._operations = kwargs.get("operations", [])

        no_cols = self._cooccurrence_matrix.shape[1]
        if "element_shape" in kwargs:
            elem_shape = kwargs["element_shape"]
            # The element shape must account for every matrix column.
            if prod(elem_shape) != no_cols:
                # Adjacent literals are used instead of a backslash
                # continuation, which put a run of indentation spaces
                # into the message.
                raise ValueError("Trying to assign invalid element shape: "
                                 "element_shape: %s, matrix columns: %s"
                                 % (str(elem_shape), str(no_cols)))

            # NOTE: watch out here, can cause bugs, if we change the
            # dimension of a regular space and we do not create a new space
            self._element_shape = elem_shape
        else:
            self._element_shape = (no_cols,)
示例#20
0
 def set_cooccurrence_matrix(self, matrix_):
     """
     Replaces the data matrix of the space.

     Args:
         matrix_: Matrix type, the new data matrix. It is checked for
             shape consistency against the current row and column
             indexing structures before it is stored.
     """
     assert_is_instance(matrix_, Matrix)
     # Argument order follows the constructors' calls:
     # (matrix, id2row, id2column, row2id, column2id). The previous order
     # paired row structures with column structures.
     assert_shape_consistent(matrix_, self.id2row, self.id2column,
                             self.row2id, self.column2id)
     self._cooccurrence_matrix = matrix_
示例#21
0
    def __init__(self,
                 matrix_,
                 id2row,
                 id2column,
                 row2id=None,
                 column2id=None,
                 operations=None,
                 element_shape=None):
        """
        Constructor.

        Args:
            matrix_: Matrix type, the data matrix of the space
            id2row: list, the row elements
            id2column: list, the column elements
            row2id: dictionary, maps row strings to ids. Optional, built from
                id2row by default.
            column2id: dictionary, maps col strings to ids. Optional, built
                from id2column by default
            operations: list of operations already performed on the input
                matrix, Optional, by default set to a new empty list.
            element_shape: tuple of int, the shape on row elements. Optional,
                by default (or when empty) row elements are one-dimensional
                and element_shape is (no_cols, ). Used in 3D composition.

        Raises:
            TypeError: if matrix_ is not of the correct type (per the
                original documentation; raised by assert_is_instance,
                defined elsewhere)
            ValueError: if element shape is not consistent with
                the size of matrix rows
                if the matrix and the provided row and column
                indexing structures are not of consistent shapes.

        """
        assert_is_instance(matrix_, Matrix)
        assert_is_instance(id2row, list)
        assert_is_instance(id2column, list)

        if row2id is None:
            row2id = list2dict(id2row)
        else:
            assert_dict_match_list(row2id, id2row)

        if column2id is None:
            column2id = list2dict(id2column)
        else:
            assert_dict_match_list(column2id, id2column)

        assert_shape_consistent(matrix_, id2row, id2column, row2id, column2id)

        self._cooccurrence_matrix = matrix_
        self._row2id = row2id
        self._id2row = id2row
        self._column2id = column2id
        self._id2column = id2column
        # A `[]` default would be one list object shared by every space
        # built without operations; create a new list per instance.
        self._operations = [] if operations is None else operations

        no_cols = self._cooccurrence_matrix.shape[1]
        if element_shape:
            # The element shape must account for every matrix column.
            if prod(element_shape) != no_cols:
                # Adjacent literals are used instead of a backslash
                # continuation, which put a run of indentation spaces
                # into the message.
                raise ValueError("Trying to assign invalid element shape: "
                                 "element_shape: %s, matrix columns: %s"
                                 % (str(element_shape), str(no_cols)))

            # NOTE: watch out here, can cause bugs, if we change the
            # dimension of a regular space and we do not create a new space
            self._element_shape = element_shape
        else:
            self._element_shape = (no_cols, )