def test_SplitCorpus(self):
    odd = split_corpus(self.corpus.corpus, [1, 3, 5])
    even = split_corpus(self.corpus.corpus, [2, 4, 6])

    odd_expected = [np.array([0]), np.array([1, 0]),
                    np.array([3, 0]), np.array([2])]
    even_expected = [np.array([0, 1]), np.array([0, 3]),
                     np.array([0, 2])]

    for i in range(len(odd)):
        np.testing.assert_array_equal(odd[i], odd_expected[i])
    for i in range(len(even)):
        np.testing.assert_array_equal(even[i], even_expected[i])
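# A minimal sketch of the behavior the test above checks, assuming the flat
# corpus is the integer array inferred from the expected values (illustrative
# only). split_corpus appears to cut the array at the given indices much like
# numpy.split, except that the empty trailing piece produced by the break at
# 6 is dropped (even_expected holds three arrays, not four).
import numpy as np

corpus = np.array([0, 1, 0, 3, 0, 2])   # hypothetical flat corpus
print(np.split(corpus, [1, 3, 5]))      # [0], [1, 0], [3, 0], [2]
print(np.split(corpus, [2, 4, 6]))      # [0, 1], [0, 3], [0, 2], [] -- split_corpus drops the empty tail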
def view_contexts(self, ctx_type, as_slices=False, as_indices=False):
    """
    Displays a tokenization of the corpus.

    :param ctx_type: The type of tokenization.
    :type ctx_type: string-like

    :param as_slices: If `True`, a list of slices corresponding to
        `ctx_type` is returned. Otherwise, integer representations are
        returned. Default is `False`.
    :type as_slices: Boolean, optional

    :param as_indices: If `True`, the raw break indices for `ctx_type`
        are returned. Default is `False`.
    :type as_indices: Boolean, optional

    :Returns: A tokenized view of `corpus`.

    :See Also: :class:`BaseCorpus`, :meth:`numpy.split`
    """
    indices = self.view_metadata(ctx_type)['idx']

    if as_indices:
        return indices

    if as_slices:
        if len(indices) == 0:
            return [slice(0, 0)]

        slices = [slice(0, indices[0])]
        for i in range(len(indices) - 1):
            slices.append(slice(indices[i], indices[i + 1]))
        return slices

    return split_corpus(self.corpus, indices)
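# Hypothetical usage of view_contexts, assuming `c` is a corpus object with a
# 'document' tokenization recorded in its metadata; the names here are
# illustrative, not part of the method above.
words_by_doc = c.view_contexts('document')              # list of arrays, one per document
spans = c.view_contexts('document', as_slices=True)     # list of slice objects into c.corpus
breaks = c.view_contexts('document', as_indices=True)   # raw break indices from the metadata
print(len(words_by_doc), len(spans), len(breaks))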
def word_topics(self, word, as_strings=True):
    """
    Searches for every occurrence of `word` in the entire corpus and
    returns a list in which each row contains the name or ID number of
    the document, the relative position within the document, and the
    topic number assigned to each occurrence of `word`.

    :param word: The word for which the search is performed.
    :type word: string

    :param as_strings: If `True`, returns document names rather than
        ID numbers. Default is `True`.
    :type as_strings: boolean, optional

    :returns: an instance of :class:`LabeledColumn`.
        A structured array consisting of three columns. Each column is
        a list of:
        (1) name/ID of the document containing `word`
        (2) relative position of `word` in the document
        (3) topic number assigned to the token.
    """
    w, word = self._res_word_type(word)

    # Search for occurrences of the word in the corpus and collect the
    # position and topic assignment of each occurrence found.
    ct = self.model.context_type
    contexts = self.corpus.view_contexts(ct)
    idx = [(contexts[d] == w) for d in range(len(contexts))]
    Z = split_corpus(self.model.Z, self.model.indices)
    Z_w = [(d, i, t) for d in range(len(Z))
           for i, t in enumerate(Z[d]) if idx[d][i]]

    # Label data
    if as_strings:
        docs = self.corpus.view_metadata(ct)[self._doc_label_name]
        dt = [('doc', docs.dtype), ('pos', int), ('value', int)]
        Z_w = [(docs[d], i, t) for (d, i, t) in Z_w]
    else:
        dt = [('i', int), ('pos', int), ('value', int)]

    Z_w = np.array(Z_w, dtype=dt).view(LabeledColumn)
    Z_w.col_header = 'Word: ' + word
    Z_w.subcol_headers = ['Document', 'Pos', 'Topic']

    return Z_w
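# Hypothetical usage of word_topics, assuming `v` is a viewer wrapping a
# fitted model and its corpus; the query word is illustrative.
hits = v.word_topics('justice', as_strings=True)
print(hits)            # LabeledColumn table with Document / Pos / Topic rows
print(hits['value'])   # just the assigned topic numbers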
def docs(self):
    """Returns `self.corpus` split into documents at `self.indices`."""
    return split_corpus(self.corpus, self.indices)
def Z_split(self):
    """Returns the topic assignments `self.Z` split at `self.indices`."""
    return split_corpus(self.Z, self.indices)
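# Sketch of how docs() and Z_split() line up, assuming `m` is a fitted model
# exposing `corpus`, `Z`, and `indices` as above (names illustrative). Both
# arrays are split at the same indices, so the d-th word array and the d-th
# topic-assignment array align position by position -- the pairing that
# word_topics exploits.
for w_arr, z_arr in zip(m.docs(), m.Z_split()):
    assert len(w_arr) == len(z_arr)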