Пример #1
0
    def __init__(
            self,
            X,
            mX,
            y,
            term_idx_store,
            category_idx_store,
            metadata_idx_store,
            unigram_frequency_path=None):
        '''
        Initialize the category-independent state through
        TermDocMatrixWithoutCategories, then store the category index
        array and the category index store on the instance.

        Parameters
        ----------
        X : csr_matrix
            Term-document matrix.
        mX : csr_matrix
            Metadata-document matrix.
        y : np.array
            Category index array.
        term_idx_store : IndexStore
            Term indices.
        category_idx_store : IndexStore
            Category indices.
        metadata_idx_store : IndexStore
            Document metadata indices.
        unigram_frequency_path : str or None
            Path to term frequency file.
        '''
        # Everything except the category information is forwarded, by
        # keyword, to the category-agnostic initializer.
        base_kwargs = dict(
            X=X,
            mX=mX,
            term_idx_store=term_idx_store,
            metadata_idx_store=metadata_idx_store,
            unigram_frequency_path=unigram_frequency_path,
        )
        TermDocMatrixWithoutCategories.__init__(self, **base_kwargs)
        self._y, self._category_idx_store = y, category_idx_store
 def _apply_pipeline_and_get_build_instance(
         self,
         X_factory,
         mX_factory,
         df,
         parse_pipeline,
         term_idx_store,
         metadata_idx_store):
     '''
     Run the parse pipeline over ``df`` and build a
     TermDocMatrixWithoutCategories from the resulting matrices.

     Parameters
     ----------
     X_factory
         Passed to build_sparse_matrices_with_num_docs to produce the
         first returned matrix.
     mX_factory
         Passed to build_sparse_matrices_with_num_docs to produce the
         second returned matrix.
     df
         Object exposing ``apply(func, axis=1)`` and ``len()``;
         presumably a pandas DataFrame -- TODO confirm against callers.
     parse_pipeline
         Object whose ``parse`` method is applied along axis 1 of ``df``.
     term_idx_store
         Forwarded as the third positional constructor argument.
     metadata_idx_store
         Forwarded as the fourth positional constructor argument.

     Returns
     -------
     TermDocMatrixWithoutCategories
     '''
     # The return value of apply() is discarded: the call is made only for
     # whatever side effects parse_pipeline.parse has (presumably filling
     # the factories / index stores -- verify in the pipeline class).
     df.apply(parse_pipeline.parse, axis=1)
     num_docs = len(df)
     term_matrix, metadata_matrix = build_sparse_matrices_with_num_docs(
         num_docs, X_factory, mX_factory)
     return TermDocMatrixWithoutCategories(
         term_matrix, metadata_matrix, term_idx_store, metadata_idx_store)
Пример #3
0
    def __init__(
            self,
            X,
            mX,
            y,
            term_idx_store,
            category_idx_store,
            metadata_idx_store,
            unigram_frequency_path=None):
        '''
        Check that the documents carry more than one category label,
        initialize the category-independent state through
        TermDocMatrixWithoutCategories, then store the category index
        array and the category index store on the instance.

        Parameters
        ----------
        X : csr_matrix
            Term-document matrix.
        mX : csr_matrix
            Metadata-document matrix.
        y : np.array
            Category index array.
        term_idx_store : IndexStore
            Term indices.
        category_idx_store : IndexStore
            Category indices.
        metadata_idx_store : IndexStore
            Document metadata indices.
        unigram_frequency_path : str or None
            Path to term frequency file.

        Raises
        ------
        CannotCreateATermDocMatrixWithASignleCategoryException
            If every entry of ``y`` compares equal to ``y[0]``.
        '''
        # NOTE(review): y[0] is read unconditionally, so an empty ``y``
        # presumably fails with an IndexError here -- confirm whether
        # callers can ever pass one.
        first_label = y[0]
        if all(y == first_label):
            only_category = str(category_idx_store.getval(first_label))
            message = ('Documents must be labeled with more than one category. '
                       'All documents were labeled with category: "'
                       + only_category + '"')
            raise CannotCreateATermDocMatrixWithASignleCategoryException(message)

        # Everything except the category information is forwarded, by
        # keyword, to the category-agnostic initializer.
        base_kwargs = dict(
            X=X,
            mX=mX,
            term_idx_store=term_idx_store,
            metadata_idx_store=metadata_idx_store,
            unigram_frequency_path=unigram_frequency_path,
        )
        TermDocMatrixWithoutCategories.__init__(self, **base_kwargs)
        self._y, self._category_idx_store = y, category_idx_store