def apply(self, transformation):
    """
    Applies a transformation on the current space.

    All transformations affect the data matrix. If the transformation
    reduces the dimensionality of the space, the column indexing
    structures are also updated. The operation applied is appended to the
    list of operations that the space holds.

    Args:
        transformation: of type Scaling, DimensionalityReduction or
            FeatureSelection

    Returns:
        A new space on which the transformation has been applied.
    """
    start = time.time()
    assert_is_instance(transformation,
                       (Scaling, DimensionalityReduction, FeatureSelection))
    op = transformation.create_operation()
    new_matrix = op.apply(self.cooccurrence_matrix)

    new_operations = list(self.operations)
    new_operations.append(op)

    id2row, row2id = list(self.id2row), self.row2id.copy()

    if isinstance(op, DimensionalityReductionOperation):
        # Dimensionality reduction produces new, unnamed columns.
        self.assert_1dim_element()
        id2column, column2id = [], {}
    elif isinstance(op, FeatureSelectionOperation):
        # Feature selection keeps a subset of the original columns.
        self.assert_1dim_element()
        op.original_columns = self.id2column

        if op.original_columns:
            id2column = list(array(op.original_columns)[op.selected_columns])
            column2id = list2dict(id2column)
        else:
            id2column, column2id = [], {}
    else:
        id2column, column2id = list(self.id2column), self.column2id.copy()

    log.print_transformation_info(logger, transformation, 1,
                                  "\nApplied transformation:")
    log.print_matrix_info(logger, self.cooccurrence_matrix, 2,
                          "Original semantic space:")
    log.print_matrix_info(logger, new_matrix, 2, "Resulted semantic space:")
    log.print_time_info(logger, time.time(), start, 2)

    return Space(new_matrix, id2row, id2column, row2id, column2id,
                 operations=new_operations)
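# Usage sketch for Space.apply (a minimal example, not the canonical one):
# the import paths and the PpmiWeighting / Svd transformation classes are
# assumed from the DISSECT toolkit layout and may differ in other setups.
from composes.semantic_space.space import Space
from composes.matrix.dense_matrix import DenseMatrix
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.dim_reduction.svd import Svd
import numpy as np

space = Space(DenseMatrix(np.array([[1.0, 2.0, 3.0],
                                    [4.0, 0.0, 1.0],
                                    [0.0, 2.0, 5.0]])),
              ["car", "bike", "train"], ["f1", "f2", "f3"])

# Each call returns a new Space; the applied operation is recorded so that
# peripheral spaces can later replay it.
weighted_space = space.apply(PpmiWeighting())
reduced_space = weighted_space.apply(Svd(2))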
def load(file_name, data_type=None):
    with open(file_name, 'rb') as f:
        result = pickle.load(f)
    if data_type is not None:
        assert_is_instance(result, data_type)
    return result
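# Usage sketch for load() above (hedged): "space.pkl" is a hypothetical
# path, and Space's module path is assumed from the DISSECT toolkit layout.
# Passing data_type adds a type check on the unpickled object and raises
# if the file does not contain an object of that type.
from composes.semantic_space.space import Space

my_space = load("space.pkl", data_type=Space)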
def compose(self, data, arg_space):
    """
    Uses a lexical function composition model to compose elements.

    Args:
        data: data to be composed. List of tuples, each containing 3
            strings: (function_word, arg, composed_phrase). function_word
            and arg are the elements to be composed and composed_phrase is
            the string associated to their composition. function_word
            elements are interpreted in self.function_space.

        arg_space: argument space, of type Space. arg elements of data are
            interpreted in this space.

    Returns:
        composed space: a new object of type Space, containing the phrases
        obtained through composition.
    """
    start = time.time()

    assert_is_instance(arg_space, Space)
    arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
        data, (self._function_space.row2id, arg_space.row2id, None))

    composed_vec_list = []
    for i in range(len(arg1_list)):
        arg1_vec = self._function_space.get_row(arg1_list[i])
        arg2_vec = arg_space.get_row(arg2_list[i])

        matrix_type = get_type_of_largest([arg1_vec, arg2_vec])
        [arg1_vec, arg2_vec] = resolve_type_conflict([arg1_vec, arg2_vec],
                                                     matrix_type)

        composed_ph_vec = self._compose(arg1_vec, arg2_vec,
                                        self._function_space.element_shape)
        composed_vec_list.append(composed_ph_vec)

    result_element_shape = self._function_space.element_shape[0:-1]
    composed_ph_mat = composed_ph_vec.nary_vstack(composed_vec_list)

    log.print_name(logger, self, 1, "\nComposed with composition model:")
    log.print_info(logger, 3,
                   "Composed total data points:%s" % len(arg1_list))
    log.print_info(logger, 3,
                   "Functional shape of the resulted (composed) elements:%s"
                   % (result_element_shape,))
    log.print_matrix_info(logger, composed_ph_mat, 4,
                          "Resulted (composed) semantic space:")
    log.print_time_info(logger, time.time(), start, 2)

    return Space(composed_ph_mat, phrase_list, self.composed_id2column,
                 element_shape=result_element_shape)
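# Usage sketch for compose (hedged): `lex_func` is assumed to be an already
# trained lexical function model whose function space contains the rows
# "good_function" and "nice_function", and `arg_space` an argument Space
# containing "book" and "city". All of these names are illustrative
# assumptions, not part of the code above.
compose_data = [("good_function", "book", "good_book"),
                ("nice_function", "city", "nice_city")]

# Each (function_word, arg, composed_phrase) tuple yields one row, named
# composed_phrase, in the returned space.
composed_space = lex_func.compose(compose_data, arg_space)
print(composed_space.id2row)   # expected: ["good_book", "nice_city"]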
def get_neighbours(self, word, no_neighbours, similarity, space2=None):
    """
    Computes the neighbours of a word in the semantic space.

    Args:
        word: string, target word
        no_neighbours: int, the number of neighbours desired
        similarity: of type Similarity, the similarity measure to be used
        space2: Space type, optional. If provided, the neighbours are
            retrieved from this space rather than from the current space,
            which is the default.

    Returns:
        list of (neighbour_string, similarity_value) tuples.

    Raises:
        KeyError: if the word is not found in the semantic space.
    """
    start = time.time()
    assert_is_instance(similarity, Similarity)
    vector = self.get_row(word)

    if space2 is None:
        id2row = self.id2row
        sims_to_matrix = similarity.get_sims_to_matrix(
            vector, self.cooccurrence_matrix)
    else:
        mat_type = type(space2.cooccurrence_matrix)
        if not isinstance(vector, mat_type):
            vector = mat_type(vector)

        sims_to_matrix = similarity.get_sims_to_matrix(
            vector, space2.cooccurrence_matrix)
        id2row = space2.id2row

    sorted_perm = sims_to_matrix.sorted_permutation(sims_to_matrix.sum, 1)
    no_neighbours = min(no_neighbours, len(id2row))
    result = []

    for count in range(no_neighbours):
        i = sorted_perm[count]
        result.append((id2row[i], sims_to_matrix[i, 0]))

    log.print_info(logger, 1, "\nGetting neighbours of:%s" % word)
    log.print_name(logger, similarity, 1, "Similarity:")
    log.print_time_info(logger, time.time(), start, 2)
    return result
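# Usage sketch for get_neighbours (hedged): CosSimilarity and the module
# paths are assumed from the DISSECT toolkit layout.
from composes.similarity.cos import CosSimilarity
from composes.semantic_space.space import Space
from composes.matrix.dense_matrix import DenseMatrix
import numpy as np

space = Space(DenseMatrix(np.array([[1.0, 0.0],
                                    [0.9, 0.1],
                                    [0.0, 1.0]])),
              ["car", "truck", "banana"], ["f1", "f2"])

# Returns at most 2 (neighbour, similarity) pairs, most similar first.
print(space.get_neighbours("car", 2, CosSimilarity()))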
def get_rows(self, words):
    """
    Returns the sub-matrix corresponding to a list of words.

    Args:
        words: list of strings

    Returns:
        Matrix type (of shape (len(words), no_cols)), the sub-matrix
        containing the words given as input.

    Raises:
        KeyError: if one of the words is not found in the space.
    """
    assert_is_instance(words, list)
    row_ids = [self.row2id[word] for word in words]
    return self.cooccurrence_matrix[row_ids, :]
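# Usage sketch for get_rows (hedged): module paths assumed from the DISSECT
# toolkit layout.
from composes.semantic_space.space import Space
from composes.matrix.dense_matrix import DenseMatrix
import numpy as np

space = Space(DenseMatrix(np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])),
              ["red", "green", "blue"], ["f1", "f2"])

# Rows come back in the order the words were requested, as a single Matrix
# of shape (2, 2) for this toy space.
sub_matrix = space.get_rows(["blue", "red"])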
def extract_arg_spaces(cls, arg_space):
    """
    TO BE MOVED TO A UTILS MODULE!
    """
    if not isinstance(arg_space, tuple):
        arg1_space = arg_space
        arg2_space = arg_space
    else:
        if len(arg_space) != 2:
            raise ValueError("expected two spaces, received %d-ary tuple"
                             % len(arg_space))
        arg1_space, arg2_space = arg_space

    assert_is_instance(arg1_space, Space)
    assert_is_instance(arg2_space, Space)
    cls._assert_space_match(arg1_space, arg2_space)

    return arg1_space, arg2_space
def get_sim(self, word1, word2, similarity, space2=None):
    """
    Computes the similarity between two targets in the semantic space.

    If one of the two targets to be compared is not found, it returns 0.0.

    Args:
        word1: string
        word2: string
        similarity: of type Similarity, the similarity measure to be used
        space2: Space type, optional. If provided, word2 is interpreted in
            this space rather than in the current space. By default, both
            words are interpreted in the current space.

    Returns:
        scalar, similarity score
    """
    assert_is_instance(similarity, Similarity)

    try:
        v1 = self.get_row(word1)
    except KeyError:
        print("Row string %s not found, returning 0.0" % word1)
        return 0.0
    try:
        if space2 is None:
            v2 = self.get_row(word2)
        else:
            v2 = space2.get_row(word2)
    except KeyError:
        print("Row string %s not found, returning 0.0" % word2)
        return 0.0

    [v1, v2] = resolve_type_conflict([v1, v2], DenseMatrix)
    return similarity.get_sim(v1, v2)
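# Usage sketch for get_sim (hedged): CosSimilarity and the module paths are
# assumed from the DISSECT toolkit layout.
from composes.similarity.cos import CosSimilarity
from composes.semantic_space.space import Space
from composes.matrix.dense_matrix import DenseMatrix
import numpy as np

space = Space(DenseMatrix(np.array([[1.0, 0.0], [1.0, 1.0]])),
              ["cat", "dog"], ["f1", "f2"])

print(space.get_sim("cat", "dog", CosSimilarity()))       # ~0.707
print(space.get_sim("cat", "unicorn", CosSimilarity()))   # 0.0, word missing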
def __init__(self, core_space, matrix_, id2row, row2id=None):
    """
    Constructor.

    Args:
        core_space: Space type, the core space that this is peripheral to.
        matrix_: Matrix type, the data matrix of the space
        id2row: list, the row elements
        row2id: dictionary, maps row strings to ids. Optional, built from
            id2row by default.

    Returns:
        A peripheral semantic space (type PeripheralSpace) on which the
        core space operations have been projected. Column indexing
        structures and operations are taken over from the core space.

    Raises:
        TypeError: if matrix_ or core_space are not of the correct type
        ValueError: if element shape is not consistent with the size of
            matrix rows, or if the matrix and the provided row and column
            indexing structures are not of consistent shapes.
    """
    assert_is_instance(matrix_, Matrix)
    assert_is_instance(core_space, Space)
    assert_is_instance(id2row, list)
    # TODO: assert it is not a peripheral space here!

    if row2id is None:
        row2id = list2dict(id2row)
    else:
        assert_dict_match_list(row2id, id2row)

    column2id = core_space.column2id
    id2column = core_space.id2column

    self._operations = list(core_space.operations)
    self._row2id = row2id
    self._id2row = id2row
    self._column2id = column2id
    self._id2column = id2column

    self._cooccurrence_matrix = self._project_core_operations(matrix_)
    assert_shape_consistent(self.cooccurrence_matrix, self._id2row,
                            self._id2column, self._row2id, self._column2id)

    self._element_shape = (self._cooccurrence_matrix.shape[1],)
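# Usage sketch for PeripheralSpace (hedged): the module path is an
# assumption based on the DISSECT toolkit layout, and `core_space` is an
# assumed existing Space with the same three columns. The core space's
# recorded operations are replayed on the peripheral rows, as described in
# the constructor docstring above.
from composes.semantic_space.peripheral_space import PeripheralSpace
from composes.matrix.dense_matrix import DenseMatrix
import numpy as np

per_space = PeripheralSpace(core_space,
                            DenseMatrix(np.array([[2.0, 1.0, 0.0]])),
                            ["new_word"])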
def set_regression_learner(self, regression_learner):
    assert_is_instance(regression_learner, RegressionLearner)
    self._regression_learner = regression_learner
def set_cooccurrence_matrix(self, matrix_):
    assert_is_instance(matrix_, Matrix)
    # Argument order matches assert_shape_consistent(matrix_, id2row,
    # id2column, row2id, column2id) as used in the constructors.
    assert_shape_consistent(matrix_, self.id2row, self.id2column,
                            self.row2id, self.column2id)
    self._cooccurrence_matrix = matrix_
def __init__(self, matrix_, id2row, id2column, row2id=None, column2id=None,
             **kwargs):
    """
    Constructor.

    Args:
        matrix_: Matrix type, the data matrix of the space
        id2row: list, the row elements
        id2column: list, the column elements
        row2id: dictionary, maps row strings to ids. Optional, built from
            id2row by default.
        column2id: dictionary, maps col strings to ids. Optional, built
            from id2column by default.
        operations: list of operations already performed on the input
            matrix. Optional, by default set to empty.
        element_shape: tuple of int, the shape of row elements. Optional,
            by default row elements are one-dimensional and element_shape
            is (no_cols, ). Used in 3D composition.

    Returns:
        A semantic space (type Space)

    Raises:
        TypeError: if matrix_ is not of the correct type
        ValueError: if element shape is not consistent with the size of
            matrix rows, or if the matrix and the provided row and column
            indexing structures are not of consistent shapes.
    """
    assert_is_instance(matrix_, Matrix)
    assert_valid_kwargs(kwargs, ["operations", "element_shape"])
    assert_is_instance(id2row, list)
    assert_is_instance(id2column, list)

    if row2id is None:
        row2id = list2dict(id2row)
    else:
        assert_dict_match_list(row2id, id2row)

    if column2id is None:
        column2id = list2dict(id2column)
    else:
        assert_dict_match_list(column2id, id2column)

    assert_shape_consistent(matrix_, id2row, id2column, row2id, column2id)

    self._cooccurrence_matrix = matrix_
    self._row2id = row2id
    self._id2row = id2row
    self._column2id = column2id
    self._id2column = id2column

    self._operations = kwargs["operations"] if "operations" in kwargs else []

    if "element_shape" in kwargs:
        elem_shape = kwargs["element_shape"]
        if prod(elem_shape) != self._cooccurrence_matrix.shape[1]:
            raise ValueError("Trying to assign invalid element shape: "
                             "element_shape: %s, matrix columns: %s"
                             % (str(elem_shape),
                                str(self._cooccurrence_matrix.shape[1])))
        # NOTE: watch out here, can cause bugs if we change the dimension
        # of a regular space and do not create a new space
        self._element_shape = elem_shape
    else:
        self._element_shape = (self._cooccurrence_matrix.shape[1],)
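# Minimal construction sketch (hedged): module paths assumed from the
# DISSECT toolkit layout; row2id and column2id are built automatically from
# the lists when omitted.
from composes.semantic_space.space import Space
from composes.matrix.sparse_matrix import SparseMatrix
from scipy.sparse import csr_matrix

mat = SparseMatrix(csr_matrix([[1.0, 0.0, 2.0],
                               [0.0, 3.0, 0.0]]))
space = Space(mat, ["car", "bike"], ["wheel-n", "engine-n", "pedal-n"])
print(space.id2row, space.row2id)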
def __init__(self, matrix_, id2row, id2column, row2id=None, column2id=None,
             operations=None, element_shape=None):
    """
    Constructor.

    Args:
        matrix_: Matrix type, the data matrix of the space
        id2row: list, the row elements
        id2column: list, the column elements
        row2id: dictionary, maps row strings to ids. Optional, built from
            id2row by default.
        column2id: dictionary, maps col strings to ids. Optional, built
            from id2column by default.
        operations: list of operations already performed on the input
            matrix. Optional, by default set to empty.
        element_shape: tuple of int, the shape of row elements. Optional,
            by default row elements are one-dimensional and element_shape
            is (no_cols, ). Used in 3D composition.

    Returns:
        A semantic space (type Space)

    Raises:
        TypeError: if matrix_ is not of the correct type
        ValueError: if element shape is not consistent with the size of
            matrix rows, or if the matrix and the provided row and column
            indexing structures are not of consistent shapes.
    """
    assert_is_instance(matrix_, Matrix)
    assert_is_instance(id2row, list)
    assert_is_instance(id2column, list)

    if row2id is None:
        row2id = list2dict(id2row)
    else:
        assert_dict_match_list(row2id, id2row)

    if column2id is None:
        column2id = list2dict(id2column)
    else:
        assert_dict_match_list(column2id, id2column)

    assert_shape_consistent(matrix_, id2row, id2column, row2id, column2id)

    self._cooccurrence_matrix = matrix_
    self._row2id = row2id
    self._id2row = id2row
    self._column2id = column2id
    self._id2column = id2column
    # Avoid the mutable default argument: each instance gets a fresh list.
    self._operations = operations if operations is not None else []

    if element_shape:
        if prod(element_shape) != self._cooccurrence_matrix.shape[1]:
            raise ValueError("Trying to assign invalid element shape: "
                             "element_shape: %s, matrix columns: %s"
                             % (str(element_shape),
                                str(self._cooccurrence_matrix.shape[1])))
        # NOTE: watch out here, can cause bugs if we change the dimension
        # of a regular space and do not create a new space
        self._element_shape = element_shape
    else:
        self._element_shape = (self._cooccurrence_matrix.shape[1],)
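# Sketch of the element_shape option (hedged): rows are interpreted as
# flattened 2x3 elements for 3D composition, so prod(element_shape) must
# equal the number of columns (6 here) or the constructor raises
# ValueError. The empty id2column list mirrors what apply() produces after
# dimensionality reduction; the names below are illustrative assumptions.
from composes.semantic_space.space import Space
from composes.matrix.dense_matrix import DenseMatrix
import numpy as np

mat = DenseMatrix(np.arange(12.0).reshape(2, 6))
func_space = Space(mat, ["red_function", "blue_function"], [],
                   element_shape=(2, 3))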