def svd(matrix_, reduced_dimension):
    """
    Performs SVD decomposition.

    If the rank is smaller than the requested reduced dimension,
    reduction to rank is performed. Dense SVD uses Linalg._SVD_TOL
    to decide the rank of the matrix.

    Args:
        matrix_: input of type Matrix
        reduced_dimension: int, the desired reduced dimension

    Returns:
        U, S, V of the decomposition X = USV^T. U, V: Matrix type,
        S: ndarray of singular values.
    """
    log.print_info(logger, 4, "In SVD..reducing to dim %d" % reduced_dimension)
    log.print_matrix_info(logger, matrix_, 5, "Input matrix:")

    #TODO: IMPORTANT!! do the sign normalization COLUMN-wise!!! not
    #for the full matrix at once!!
    if reduced_dimension == 0:
        raise ValueError("Cannot reduce to dimensionality 0.")

    if isinstance(matrix_, SparseMatrix):
        result = Linalg._sparse_svd(matrix_, reduced_dimension)
    elif isinstance(matrix_, DenseMatrix):
        result = Linalg._dense_svd(matrix_, reduced_dimension)
    else:
        raise TypeError("expected Matrix type, received %s" % type(matrix_))

    log.print_matrix_info(logger, result[0], 5, "Resulting matrix U:")
    return result
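# Usage sketch (illustrative, not part of the library source). Assumes
# DISSECT-style module paths and that svd() is exposed as a static method of
# Linalg; adjust the imports if the actual package layout differs.
import numpy as np

from composes.matrix.dense_matrix import DenseMatrix
from composes.matrix.linalg import Linalg

mat = DenseMatrix(np.random.rand(20, 10))
u, s, v = Linalg.svd(mat, 2)
# u: (20, k), s: (k,), v: (10, k), where k <= 2 (k is smaller than 2 only if
# the rank of the input matrix is smaller than the requested dimension).
print(u.shape, s.shape, v.shape)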
def compose(self, data, arg_space):
    """
    Uses a composition model to compose elements.

    Args:
        data: data to be composed. List of tuples, each containing 3
            strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
            elements to be composed and composed_phrase is the string
            associated to their composition.

        arg_space: argument space(s). Space object or a tuple of two
            Space objects (e.g. my_space, or (my_space1, my_space2)).
            If two spaces are provided, arg1 elements of data are
            interpreted in space1, and arg2 in space2.

    Returns:
        composed space: a new object of type Space, containing the
        phrases obtained through composition.
    """
    start = time.time()

    arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
    arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
        data, (arg1_space.row2id, arg2_space.row2id, None))

    # we try to achieve at most MAX_MEM_OVERHEAD * phrase_space memory overhead;
    # the /3.0 is needed because the composing data needs 3 * len(train_data)
    # memory (arg1 vector, arg2 vector, phrase vector)
    chunk_size = int(max(arg1_space.cooccurrence_matrix.shape[0],
                         arg2_space.cooccurrence_matrix.shape[0],
                         len(phrase_list))
                     * self.MAX_MEM_OVERHEAD / 3.0) + 1

    composed_mats = []
    for i in range(int(math.ceil(len(arg1_list) / float(chunk_size)))):
        beg, end = i * chunk_size, min((i + 1) * chunk_size, len(arg1_list))

        arg1_mat = arg1_space.get_rows(arg1_list[beg:end])
        arg2_mat = arg2_space.get_rows(arg2_list[beg:end])

        [arg1_mat, arg2_mat] = resolve_type_conflict([arg1_mat, arg2_mat],
                                                     DenseMatrix)
        composed_mat = self._compose(arg1_mat, arg2_mat)
        composed_mats.append(composed_mat)

    composed_phrase_mat = composed_mat.nary_vstack(composed_mats)

    if self.composed_id2column is None:
        self.composed_id2column = self._build_id2column(arg1_space, arg2_space)

    log.print_name(logger, self, 1, "\nComposed with composition model:")
    # report the total number of composed data points, not only the last chunk
    log.print_info(logger, 3, "Composed total data points:%s" % len(arg1_list))
    log.print_matrix_info(logger, composed_phrase_mat, 4,
                          "Resulting (composed) semantic space:")
    log.print_time_info(logger, time.time(), start, 2)

    return Space(composed_phrase_mat, phrase_list, self.composed_id2column)
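# Usage sketch (illustrative, not part of the library source). Builds a toy
# argument space and composes two phrases with a DISSECT-style
# WeightedAdditive model; module paths and constructor signatures are
# assumptions, adjust them if the actual package differs.
import numpy as np

from composes.matrix.dense_matrix import DenseMatrix
from composes.semantic_space.space import Space
from composes.composition.weighted_additive import WeightedAdditive

arg_space = Space(DenseMatrix(np.array([[1.0, 2.0],
                                        [3.0, 4.0],
                                        [5.0, 6.0]])),
                  ["red", "fast", "car"], ["f1", "f2"])

model = WeightedAdditive(alpha=1.0, beta=1.0)
data = [("red", "car", "red_car"), ("fast", "car", "fast_car")]
composed_space = model.compose(data, arg_space)

print(composed_space.id2row)               # ['red_car', 'fast_car']
print(composed_space.cooccurrence_matrix)  # the composed phrase vectors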
def compose(self, data, arg_space):
    """
    Uses a lexical function composition model to compose elements.

    Args:
        data: data to be composed. List of tuples, each containing 3
            strings: (function_word, arg, composed_phrase). function_word
            and arg are the elements to be composed and composed_phrase
            is the string associated to their composition. function_word
            elements are interpreted in self.function_space.

        arg_space: argument space, of type Space. arg elements of data are
            interpreted in this space.

    Returns:
        composed space: a new object of type Space, containing the
        phrases obtained through composition.
    """
    start = time.time()

    assert_is_instance(arg_space, Space)
    arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
        data, (self._function_space.row2id, arg_space.row2id, None))

    composed_vec_list = []
    for i in range(len(arg1_list)):
        arg1_vec = self._function_space.get_row(arg1_list[i])
        arg2_vec = arg_space.get_row(arg2_list[i])

        matrix_type = get_type_of_largest([arg1_vec, arg2_vec])
        [arg1_vec, arg2_vec] = resolve_type_conflict([arg1_vec, arg2_vec],
                                                     matrix_type)

        composed_ph_vec = self._compose(arg1_vec, arg2_vec,
                                        self._function_space.element_shape)
        composed_vec_list.append(composed_ph_vec)

    result_element_shape = self._function_space.element_shape[0:-1]
    composed_ph_mat = composed_ph_vec.nary_vstack(composed_vec_list)

    log.print_name(logger, self, 1, "\nComposed with composition model:")
    log.print_info(logger, 3, "Composed total data points:%s" % len(arg1_list))
    log.print_info(logger, 3,
                   "Functional shape of the resulting (composed) elements:%s"
                   % (result_element_shape,))
    log.print_matrix_info(logger, composed_ph_mat, 4,
                          "Resulting (composed) semantic space:")
    log.print_time_info(logger, time.time(), start, 2)

    return Space(composed_ph_mat, phrase_list, self.composed_id2column,
                 element_shape=result_element_shape)
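# Usage sketch (illustrative): composing with an already trained lexical
# function model. lf_model and noun_space are hypothetical placeholders,
# built as in the train() example further below; "big" must have a learned
# function and "car" must be a row of noun_space.
comp_data = [("big", "car", "big_car")]
composed_space = lf_model.compose(comp_data, noun_space)

print(composed_space.id2row)               # ['big_car']
print(composed_space.cooccurrence_matrix)  # the composed phrase vector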
def get_neighbours(self, word, no_neighbours, similarity, space2=None):
    """
    Computes the neighbours of a word in the semantic space.

    Args:
        word: string, target word
        no_neighbours: int, the number of neighbours desired
        similarity: of type Similarity, the similarity measure to be used
        space2: Space type, optional. If provided, the neighbours are
            retrieved from this space, rather than the current space.
            By default, neighbours are retrieved from the current space.

    Returns:
        list of (neighbour_string, similarity_value) tuples.

    Raises:
        KeyError: if the word is not found in the semantic space.
    """
    start = time.time()

    assert_is_instance(similarity, Similarity)
    vector = self.get_row(word)

    if space2 is None:
        id2row = self.id2row
        sims_to_matrix = similarity.get_sims_to_matrix(vector,
                                                       self.cooccurrence_matrix)
    else:
        mat_type = type(space2.cooccurrence_matrix)
        if not isinstance(vector, mat_type):
            vector = mat_type(vector)

        sims_to_matrix = similarity.get_sims_to_matrix(vector,
                                                       space2.cooccurrence_matrix)
        id2row = space2.id2row

    sorted_perm = sims_to_matrix.sorted_permutation(sims_to_matrix.sum, 1)
    no_neighbours = min(no_neighbours, len(id2row))
    result = []

    for count in range(no_neighbours):
        i = sorted_perm[count]
        result.append((id2row[i], sims_to_matrix[i, 0]))

    log.print_info(logger, 1, "\nGetting neighbours of:%s" % (word))
    log.print_name(logger, similarity, 1, "Similarity:")
    log.print_time_info(logger, time.time(), start, 2)

    return result
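# Usage sketch (illustrative, not part of the library source). Assumes a
# DISSECT-style CosSimilarity class at the module path below; the toy space
# is built only for the example.
import numpy as np

from composes.matrix.dense_matrix import DenseMatrix
from composes.semantic_space.space import Space
from composes.similarity.cos import CosSimilarity

space = Space(DenseMatrix(np.array([[1.0, 0.0],
                                    [0.9, 0.1],
                                    [0.0, 1.0]])),
              ["car", "truck", "idea"], ["f1", "f2"])

# The target word itself is typically returned as its own nearest neighbour.
print(space.get_neighbours("car", 2, CosSimilarity()))
# e.g. [('car', 1.0), ('truck', 0.99...)]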
def vstack(cls, space1, space2):
    """
    Classmethod. Stacks two semantic spaces.

    The rows in the two spaces are concatenated.

    Args:
        space1, space2: spaces to be stacked, of type Space

    Returns:
        Stacked space, type Space.

    Raises:
        ValueError: if the spaces have different number of columns
            or their columns are not identical
    """
    if space1.cooccurrence_matrix.shape[1] != space2.cooccurrence_matrix.shape[1]:
        raise ValueError("Inconsistent shapes: %s, %s"
                         % (space1.cooccurrence_matrix.shape[1],
                            space2.cooccurrence_matrix.shape[1]))

    if space1.id2column != space2.id2column:
        raise ValueError("Identical columns required")

    new_row2id = add_items_to_dict(space1.row2id.copy(), space2.id2row)
    new_id2row = space1.id2row + space2.id2row

    matrix_type = get_type_of_largest([space1.cooccurrence_matrix,
                                       space2.cooccurrence_matrix])
    [new_mat1, new_mat2] = resolve_type_conflict([space1.cooccurrence_matrix,
                                                  space2.cooccurrence_matrix],
                                                 matrix_type)

    new_mat = new_mat1.vstack(new_mat2)

    log.print_info(logger, 1, "\nVertical stack of two spaces")
    log.print_matrix_info(logger, space1.cooccurrence_matrix, 2,
                          "Semantic space 1:")
    log.print_matrix_info(logger, space2.cooccurrence_matrix, 2,
                          "Semantic space 2:")
    log.print_matrix_info(logger, new_mat, 2, "Resulting semantic space:")

    return Space(new_mat, new_id2row, list(space1.id2column), new_row2id,
                 space1.column2id.copy(), operations=[])
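# Usage sketch (illustrative, not part of the library source). vstack is a
# classmethod, so it is called on the Space class itself; both spaces must
# have identical columns. Module paths are assumptions.
import numpy as np

from composes.matrix.dense_matrix import DenseMatrix
from composes.semantic_space.space import Space

space_a = Space(DenseMatrix(np.array([[1.0, 2.0]])), ["dog"], ["f1", "f2"])
space_b = Space(DenseMatrix(np.array([[3.0, 4.0]])), ["cat"], ["f1", "f2"])

stacked = Space.vstack(space_a, space_b)
print(stacked.id2row)      # ['dog', 'cat']
print(stacked.id2column)   # ['f1', 'f2']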
def train(self, train_data, arg_space, phrase_space):
    """
    Trains a composition model and sets its learned parameters.

    Args:
        train_data: list of string tuples. Each tuple contains 3 string
            elements: (arg1, arg2, phrase).

        arg_space: argument space(s). Space object or a tuple of two Space
            objects (e.g. my_space, or (my_space1, my_space2)). If two
            spaces are provided, arg1 elements of train data are
            interpreted in space1, and arg2 in space2.

        phrase_space: phrase space, of type Space.

    Calls the specific training routine of the current composition model.
    Training tuples which contain strings not found in their respective
    spaces are ignored.

    The id2column attribute of the resulting composed space is set to be
    equal to that of the phrase space given as an input.
    """
    start = time.time()

    arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
    arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
        train_data,
        (arg1_space.row2id, arg2_space.row2id, phrase_space.row2id))

    self._train(arg1_space, arg2_space, phrase_space,
                arg1_list, arg2_list, phrase_list)

    self.composed_id2column = phrase_space.id2column

    log.print_composition_model_info(logger, self, 1,
                                     "\nTrained composition model:")
    log.print_info(logger, 2, "With total data points:%s" % len(arg1_list))
    log.print_matrix_info(logger, arg1_space.cooccurrence_matrix, 3,
                          "Semantic space of argument 1:")
    log.print_matrix_info(logger, arg2_space.cooccurrence_matrix, 3,
                          "Semantic space of argument 2:")
    log.print_matrix_info(logger, phrase_space.cooccurrence_matrix, 3,
                          "Semantic space of phrases:")
    log.print_time_info(logger, time.time(), start, 2)
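# Usage sketch (illustrative, not part of the library source). Estimates the
# alpha/beta weights of a DISSECT-style WeightedAdditive model from toy
# argument and phrase spaces; module paths, constructors and the toy vectors
# are assumptions made for the example.
import numpy as np

from composes.matrix.dense_matrix import DenseMatrix
from composes.semantic_space.space import Space
from composes.composition.weighted_additive import WeightedAdditive

arg_space = Space(DenseMatrix(np.array([[1.0, 2.0],
                                        [3.0, 4.0],
                                        [5.0, 6.0]])),
                  ["red", "fast", "car"], ["f1", "f2"])
phrase_space = Space(DenseMatrix(np.array([[6.0, 8.0],
                                           [8.0, 10.0]])),
                     ["red_car", "fast_car"], ["f1", "f2"])

train_data = [("red", "car", "red_car"), ("fast", "car", "fast_car")]

model = WeightedAdditive()
model.train(train_data, arg_space, phrase_space)  # learns alpha and beta
print(model.alpha, model.beta)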
def compose(self, data, arg_space):
    """
    Uses a composition model to compose elements.

    Args:
        data: data to be composed. List of tuples, each containing 3
            strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
            elements to be composed and composed_phrase is the string
            associated to their composition.

        arg_space: argument space(s). Space object or a tuple of two
            Space objects (e.g. my_space, or (my_space1, my_space2)).
            If two spaces are provided, arg1 elements of data are
            interpreted in space1, and arg2 in space2.

    Returns:
        composed space: a new object of type Space, containing the
        phrases obtained through composition.
    """
    start = time.time()

    arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
    arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(
        data, (arg1_space.row2id, arg2_space.row2id, None))

    arg1_mat = arg1_space.get_rows(arg1_list)
    arg2_mat = arg2_space.get_rows(arg2_list)
    [arg1_mat, arg2_mat] = resolve_type_conflict([arg1_mat, arg2_mat],
                                                 DenseMatrix)

    composed_phrase_mat = self._compose(arg1_mat, arg2_mat)
    if self.composed_id2column is None:
        self.composed_id2column = self._build_id2column(arg1_space, arg2_space)

    log.print_name(logger, self, 1, "\nComposed with composition model:")
    log.print_info(logger, 3,
                   "Composed total data points:%s" % arg1_mat.shape[0])
    log.print_matrix_info(logger, composed_phrase_mat, 4,
                          "Resulting (composed) semantic space:")
    log.print_time_info(logger, time.time(), start, 2)

    return Space(composed_phrase_mat, phrase_list, self.composed_id2column)
def nmf(v, w_init, h_init):
    """
    Performs Non-negative Matrix Factorization.

    It solves the problem: :math:`W,H = argmin(||X - WH||_2)`
    such that W and H are non-negative matrices.

    Args:
        v: input matrix (X above), of type Matrix
        w_init: initial value for matrix W, type Matrix
        h_init: initial value for matrix H, type Matrix

    Returns:
        W, H <Matrix>: where W, H solve the NMF problem stated above.
    """
    log.print_info(logger, 4, "In NMF..reducing to dim %d" % w_init.shape[1])
    log.print_matrix_info(logger, w_init, 5, "W init matrix:")
    log.print_matrix_info(logger, h_init, 5, "H init matrix:")

    if not isinstance(v, Matrix):
        raise TypeError("expected Matrix type, received %s" % type(v))

    w = w_init
    h = h_init
    init_time = time()

    wt = w.transpose()
    ht = h.transpose()
    vt = v.transpose()

    gradW = (w * (h * ht)) - (v * ht)
    gradH = ((wt * w) * h) - (wt * v)

    gradW_norm = gradW.norm()
    gradH_norm = gradH.norm()
    initgrad = sqrt(pow(gradW_norm, 2) + pow(gradH_norm, 2))

    #print 'Init gradient norm %f' % initgrad
    tolW = max(Linalg._NMF_MIN_TOL, Linalg._NMF_TOL) * initgrad
    tolH = tolW

    #loop_time = init_time
    for iteration in range(1, Linalg._NMF_MAX_ITER):
        log.print_info(logger, 5, "Iteration: %d(%d)"
                       % (iteration, Linalg._NMF_MAX_ITER))

        if time() - init_time > Linalg._NMF_TIME_LIMIT:
            break

        w, gradW, iterW = Linalg._nmf_nlssubprob(vt, h.transpose(), h,
                                                 w.transpose(), tolW,
                                                 Linalg._NMF_MAX_ITER_SUBPROB)
        old_w = w
        w = w.transpose()
        gradW = gradW.transpose()

        if iterW == 1:
            tolW = Linalg._NMF_TOL_DECREASE_FACTOR * tolW

        h, gradH, iterH = Linalg._nmf_nlssubprob(v, w, old_w, h, tolH,
                                                 Linalg._NMF_MAX_ITER_SUBPROB)
        if iterH == 1:
            tolH = Linalg._NMF_TOL_DECREASE_FACTOR * tolH

    log.print_matrix_info(logger, w, 5, "Return W matrix:")
    log.print_matrix_info(logger, h, 5, "Return H matrix:")
    return w, h
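# Usage sketch (illustrative, not part of the library source). Assumes nmf()
# is exposed as Linalg.nmf and that random non-negative matrices are
# acceptable initial factors; module paths are assumptions.
import numpy as np

from composes.matrix.dense_matrix import DenseMatrix
from composes.matrix.linalg import Linalg

v = DenseMatrix(np.abs(np.random.rand(20, 10)))   # non-negative input matrix
w0 = DenseMatrix(np.abs(np.random.rand(20, 3)))   # initial W: 20 x 3
h0 = DenseMatrix(np.abs(np.random.rand(3, 10)))   # initial H: 3 x 10

w, h = Linalg.nmf(v, w0, h0)
print((v - w * h).norm())                         # reconstruction error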
def train(self, train_data, arg_space, phrase_space):
    """
    Trains a lexical function composition model to learn a function
    space and sets the function_space parameter.

    Args:
        train_data: list of string tuples. Each tuple contains 3
            string elements: (function_word, arg, phrase).

        arg_space: argument space, of type Space. arg elements of
            train data are interpreted in this space.

        phrase_space: phrase space, of type Space. phrase elements of
            the train data are interpreted in this space.

    Training tuples which contain strings not found in their respective
    spaces are ignored. Function words with fewer than _MIN_SAMPLES
    training instances are ignored. For example, if _MIN_SAMPLES=2 and
    function word "red" occurs in only one phrase, "red" is ignored.

    The id2column attribute of the resulting composed space is set to be
    equal to that of the phrase space given as an input.
    """
    start = time.time()

    self._has_intercept = self._regression_learner.has_intercept()

    if not isinstance(arg_space, Space):
        raise ValueError("expected one input space!")

    result_mats = []

    train_data = sorted(train_data, key=lambda tup: tup[0])
    function_word_list, arg_list, phrase_list = self.valid_data_to_lists(
        train_data, (None, arg_space.row2id, phrase_space.row2id))
    #partitions the sorted input data
    keys, key_ranges = get_partitions(function_word_list, self._MIN_SAMPLES)

    if not keys:
        raise ValueError("No valid training data found!")

    assert len(arg_space.element_shape) == 1

    if self._has_intercept:
        new_element_shape = phrase_space.element_shape + (arg_space.element_shape[0] + 1,)
    else:
        new_element_shape = phrase_space.element_shape + (arg_space.element_shape[0],)

    for i in range(len(key_ranges)):

        idx_beg, idx_end = key_ranges[i]

        print("Training lexical function...%s with %d samples"
              % (keys[i], idx_end - idx_beg))

        arg_mat = arg_space.get_rows(arg_list[idx_beg:idx_end])
        phrase_mat = phrase_space.get_rows(phrase_list[idx_beg:idx_end])

        #convert them to the same type
        matrix_type = get_type_of_largest([arg_mat, phrase_mat])
        [arg_mat, phrase_mat] = resolve_type_conflict([arg_mat, phrase_mat],
                                                      matrix_type)

        result_mat = self._regression_learner.train(arg_mat, phrase_mat).transpose()

        result_mat.reshape((1, np.prod(new_element_shape)))

        result_mats.append(result_mat)

    new_space_mat = arg_mat.nary_vstack(result_mats)

    self.composed_id2column = phrase_space.id2column

    self._function_space = Space(new_space_mat, keys, [],
                                 element_shape=new_element_shape)

    log.print_composition_model_info(logger, self, 1,
                                     "\nTrained composition model:")
    log.print_info(logger, 3, "Trained: %s lexical functions" % len(keys))
    log.print_info(logger, 3,
                   "With total data points:%s" % len(function_word_list))
    log.print_matrix_info(logger, arg_space.cooccurrence_matrix, 3,
                          "Semantic space of arguments:")
    log.print_info(logger, 3,
                   "Shape of lexical functions learned:%s"
                   % (new_element_shape,))
    log.print_matrix_info(logger, new_space_mat, 3,
                          "Semantic space of lexical functions:")
    log.print_time_info(logger, time.time(), start, 2)
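# Usage sketch (illustrative, not part of the library source). Trains a
# DISSECT-style LexicalFunction model on a toy adjective-noun dataset; the
# module paths, the default constructor and the toy vectors are assumptions.
import numpy as np

from composes.matrix.dense_matrix import DenseMatrix
from composes.semantic_space.space import Space
from composes.composition.lexical_function import LexicalFunction

noun_space = Space(DenseMatrix(np.array([[1.0, 2.0],
                                         [3.0, 4.0],
                                         [5.0, 6.0]])),
                   ["car", "book", "idea"], ["f1", "f2"])
phrase_space = Space(DenseMatrix(np.array([[2.0, 4.0],
                                           [6.0, 8.0],
                                           [10.0, 12.0]])),
                     ["big_car", "big_book", "big_idea"], ["f1", "f2"])

# "big" appears in three phrases, so it passes the _MIN_SAMPLES threshold.
train_data = [("big", "car", "big_car"),
              ("big", "book", "big_book"),
              ("big", "idea", "big_idea")]

lf_model = LexicalFunction()
lf_model.train(train_data, noun_space, phrase_space)
print(lf_model.function_space.id2row)   # ['big']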