Example #1
    def test_SplitCorpus(self):
        odd = split_corpus(self.corpus.corpus, [1, 3, 5])
        even = split_corpus(self.corpus.corpus, [2, 4, 6])

        odd_expected = [np.array([0]), np.array([1, 0]), np.array([3, 0]), np.array([2])]
        even_expected = [np.array([0, 1]), np.array([0, 3]), np.array([0, 2])]

        for i in range(len(odd)):
            np.testing.assert_array_equal(odd[i], odd_expected[i])
        for i in range(len(even)):
            np.testing.assert_array_equal(even[i], even_expected[i])
Example #2
    def test_SplitCorpus(self):
        odd = split_corpus(self.corpus.corpus, [1, 3, 5])
        even = split_corpus(self.corpus.corpus, [2, 4, 6])

        odd_expected = [np.array([0]), np.array([1, 0]),
                np.array([3, 0]), np.array([2])]
        even_expected = [np.array([0, 1]), np.array([0, 3]),
                np.array([0, 2])]

        for i in range(len(odd)):
            np.testing.assert_array_equal(odd[i], odd_expected[i])
        for i in range(len(even)):
            np.testing.assert_array_equal(even[i], even_expected[i])
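Both tests above pin down the same contract: `split_corpus` cuts the flat corpus array at the given break indices and drops any empty trailing segment (splitting at [2, 4, 6] yields only three arrays). A minimal sketch consistent with those expected outputs, assuming `split_corpus` is essentially a thin wrapper around `numpy.split`; the name `split_corpus_sketch` and the reconstructed corpus are illustrative, not the library's actual implementation:

    import numpy as np

    def split_corpus_sketch(arr, indices):
        # Illustrative only: cut `arr` at `indices` and drop empty segments,
        # mirroring the expected values in the tests above.
        return [seg for seg in np.split(np.asarray(arr), indices) if len(seg)]

    # The corpus implied by the expected values is [0, 1, 0, 3, 0, 2]:
    corpus = np.array([0, 1, 0, 3, 0, 2])
    split_corpus_sketch(corpus, [1, 3, 5])  # [array([0]), array([1, 0]), array([3, 0]), array([2])]
    split_corpus_sketch(corpus, [2, 4, 6])  # [array([0, 1]), array([0, 3]), array([0, 2])]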
Example #3
    def view_contexts(self, ctx_type, as_slices=False, as_indices=False):
        """
        Displays a tokenization of the corpus.

        :param ctx_type: The type of a tokenization.
        :type ctx_type: string-like

        :param as_slices: If True, a list of slices corresponding to 
            'ctx_type' is returned. Otherwise, integer representations
            are returned. Default is `False`.
        :type as_slices: Boolean, optional

        :Returns: A tokenized view of `corpus`.

        :See Also: :class:`BaseCorpus`, :meth:`numpy.split`
        """
        indices = self.view_metadata(ctx_type)["idx"]

        if as_indices:
            return indices

        if as_slices:
            if len(indices) == 0:
                return [slice(0, 0)]

            slices = []
            slices.append(slice(0, indices[0]))
            for i in range(len(indices) - 1):
                slices.append(slice(indices[i], indices[i + 1]))
            return slices

        return split_corpus(self.corpus, indices)
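The three return modes are alternative views of the same segmentation: the raw break indices (`as_indices`), a list of `slice` objects (`as_slices`), or the split sub-arrays themselves. A small standalone check of that equivalence, mirroring the slice-building loop above; the array and indices here are made up for illustration:

    import numpy as np

    corpus = np.arange(10)
    indices = [3, 7, 10]  # hypothetical context boundaries

    # as_slices=True builds [0:3], [3:7], [7:10] ...
    slices = [slice(0, indices[0])]
    for i in range(len(indices) - 1):
        slices.append(slice(indices[i], indices[i + 1]))

    # ... and each slice selects the same segment that np.split produces.
    for s, seg in zip(slices, np.split(corpus, indices)):
        np.testing.assert_array_equal(corpus[s], seg)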
Example #4
    def view_contexts(self, ctx_type, as_slices=False, as_indices=False):
        """
        Displays a tokenization of the corpus.

        :param ctx_type: The type of a tokenization.
        :type ctx_type: string-like

        :param as_slices: If True, a list of slices corresponding to 
            'ctx_type' is returned. Otherwise, integer representations
            are returned. Default is `False`.
        :type as_slices: Boolean, optional

        :Returns: A tokenized view of `corpus`.

        :See Also: :class:`BaseCorpus`, :meth:`numpy.split`
        """
        indices = self.view_metadata(ctx_type)['idx']

        if as_indices:
            return indices

        if as_slices:
            if len(indices) == 0:
                return [slice(0, 0)]

            slices = []
            slices.append(slice(0, indices[0]))
            for i in range(len(indices) - 1):
                slices.append(slice(indices[i], indices[i + 1]))
            return slices

        return split_corpus(self.corpus, indices)
Example #5
    def word_topics(self, word, as_strings=True):
        """
        Searches for every occurrence of `word` in the entire corpus and returns 
        a list each row of which contains the name or ID number of document, 
        the relative position in the document, and the assigned topic number 
        for each occurrence of `word`.
        
        :param word: The word for which the search is performed.  
        :type word: string

        :param as_strings: If `True`, returns document names rather than 
            ID numbers. Default is `True`.
        :type as_strings: boolean, optional

        :returns: an instance of :class:`LabeledColumn`.
            A structured array consisting of three columns. Each column 
            is a list of:
            (1) name/ID of document containing `word`
            (2) relative position of `word` in the document
            (3) Topic number assigned to the token.
        """
        w, word = self._res_word_type(word)

        # Search for occurrences of the word in the corpus and record the
        # position and topic assignment of each occurrence
        ct = self.model.context_type
        contexts = self.corpus.view_contexts(ct)
        idx = [(contexts[d] == w) for d in range(len(contexts))]
        Z = split_corpus(self.model.Z, self.model.indices)
        Z_w = [(d, i, t) for d in range(len(Z))
               for i, t in enumerate(Z[d]) if idx[d][i]]

        # Label data
        if as_strings:
            docs = self.corpus.view_metadata(ct)[self._doc_label_name]
            dt = [('doc', docs.dtype), ('pos', int), ('value', int)]
            Z_w = [(docs[d], i, t) for (d, i, t) in Z_w]
        else:
            dt = [('i', int), ('pos', int), ('value', int)]

        Z_w = np.array(Z_w, dtype=dt).view(LabeledColumn)
        Z_w.col_header = 'Word: ' + word
        Z_w.subcol_headers = ['Document', 'Pos', 'Topic']

        return Z_w
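The tail of the method is plain structured-array construction: the `(doc, pos, topic)` tuples become rows, and `LabeledColumn` (an ndarray subclass from the library) only adds display headers on top. A NumPy-only sketch of that step with made-up rows; note that `np.int` has been removed in recent NumPy releases, so a plain `int` is used in the dtype:

    import numpy as np

    rows = [('doc_01', 4, 17), ('doc_03', 0, 2)]         # hypothetical (doc, pos, topic) hits
    dt = [('doc', 'U16'), ('pos', int), ('value', int)]
    Z_w = np.array(rows, dtype=dt)

    Z_w['doc']    # array(['doc_01', 'doc_03'], dtype='<U16')
    Z_w['value']  # array([17,  2])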
Example #6
    def word_topics(self, word, as_strings=True):
        """
        Searches for every occurrence of `word` in the entire corpus and returns 
        a list each row of which contains the name or ID number of document, 
        the relative position in the document, and the assigned topic number 
        for each occurrence of `word`.
        
        :param word: The word for which the search is performed.  
        :type word: string

        :param as_strings: If `True`, returns document names rather than 
            ID numbers. Default is `True`.
        :type as_strings: boolean, optional

        :returns: an instance of :class:`LabeledColumn`.
            A structured array consisting of three columns. Each column 
            is a list of:
            (1) name/ID of document containing `word`
            (2) relative position of `word` in the document
            (3) Topic number assigned to the token.
        """
        w, word = self._res_word_type(word)

        # Search for occurrences of the word in the corpus and record the
        # position and topic assignment of each occurrence
        ct = self.model.context_type
        contexts = self.corpus.view_contexts(ct)
        idx = [(contexts[d] == w) for d in range(len(contexts))]
        Z = split_corpus(self.model.Z, self.model.indices)
        Z_w = [(d, i, t) for d in range(len(Z)) for i, t in enumerate(Z[d])
               if idx[d][i]]

        # Label data
        if as_strings:
            docs = self.corpus.view_metadata(ct)[self._doc_label_name]
            dt = [('doc', docs.dtype), ('pos', int), ('value', int)]
            Z_w = [(docs[d], i, t) for (d, i, t) in Z_w]
        else:
            dt = [('i', int), ('pos', int), ('value', int)]

        Z_w = np.array(Z_w, dtype=dt).view(LabeledColumn)
        Z_w.col_header = 'Word: ' + word
        Z_w.subcol_headers = ['Document', 'Pos', 'Topic']

        return Z_w
Example #7
    def docs(self):
        return split_corpus(self.corpus, self.indices)
Example #8
    def Z_split(self):
        return split_corpus(self.Z, self.indices)
Example #9
    def docs(self):
        return split_corpus(self.corpus, self.indices)
Example #10
    def Z_split(self):
        # print self.indices
        return split_corpus(self.Z, self.indices)
Example #11
    def Z_split(self):
        return split_corpus(self.Z, self.indices)
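Examples #7–#11 all split two parallel arrays, the corpus itself and the per-position topic assignments `Z`, at the same `indices`; that shared split is what lets `word_topics` above zip `contexts[d]` against `Z[d]` position by position. A toy illustration of the alignment, with data invented for the example:

    import numpy as np

    indices = [2, 5]                    # hypothetical document boundaries
    corpus = np.array([4, 9, 4, 1, 7])  # word ids: documents of length 2 and 3
    Z = np.array([0, 1, 0, 2, 2])       # one topic assignment per corpus position

    docs = np.split(corpus, indices)
    topics = np.split(Z, indices)

    for words, zs in zip(docs, topics):
        assert len(words) == len(zs)    # same cut points, so the pieces stay aligned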