def labeledMultiDocs2DocFreqMatrix(self,
                                   labeled_documents: AvailableInputTypes,
                                   is_use_cache: bool = False,
                                   is_use_memmap: bool = False,
                                   path_working_dir: str = None,
                                   ngram: int = 1,
                                   n_jobs: int = 1,
                                   joblib_backend: str = 'auto') -> DataCsrMatrix:
    """This function makes a document-frequency matrix. The document-frequency matrix is a scipy.csr_matrix.

    * Input object
        - "labeled_documents" is either a Dict object or a shelve.DbfilenameShelf. The example format is below
        >>> {"label_a": [["I", "aa", "aa", "aa", "aa", "aa"], ["bb", "aa", "aa", "aa", "aa", "aa"], ["I", "aa", "hero", "some", "ok", "aa"]],
        >>>  "label_b": [["bb", "bb", "bb"], ["bb", "bb", "bb"], ["hero", "ok", "bb"], ["hero", "cc", "bb"]],
        >>>  "label_c": [["cc", "cc", "cc"], ["cc", "cc", "bb"], ["xx", "xx", "cc"], ["aa", "xx", "cc"]]}

    * Output
        - DataCsrMatrix object.
    """
    self.__check_data_structure(labeled_documents)

    if ngram > 1:
        labeled_documents = ngram_constructor.ngram_constructor(
            labeled_documents=labeled_documents,
            ngram=ngram,
            n_jobs=n_jobs)

    logger.debug(msg='Now pre-processing before CSR matrix')
    # convert data structure
    set_document_information = labeledMultiDocs2labeledDocsSet.multiDocs2DocFreqInfo(
        labeled_documents,
        n_jobs=n_jobs)
    assert isinstance(set_document_information, labeledMultiDocs2labeledDocsSet.SetDocumentInformation)

    # count n(docs) per label
    n_docs_distribution = self.count_document_distribution(
        labeled_documents=labeled_documents,
        label2id=set_document_information.label2id)

    # count term-frequency per label
    term_frequency_distribution = self.count_term_frequency_distribution(
        labeled_documents=labeled_documents,
        label2id=set_document_information.label2id)

    return DataCsrMatrix(
        csr_matrix_=set_document_information.matrix_object,
        label2id_dict=set_document_information.label2id,
        vocabulary=set_document_information.feature2id,
        n_docs_distribution=n_docs_distribution,
        n_term_freq_distribution=term_frequency_distribution,
        is_use_cache=is_use_cache,
        is_use_memmap=is_use_memmap,
        path_working_dir=path_working_dir)
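# --- Usage sketch (illustrative, kept as a comment so the class body stays valid) ---
# A minimal sketch of calling labeledMultiDocs2DocFreqMatrix. `DataConverter`
# is an assumed name for the class defining this method; substitute the actual
# class in this module.
#
#     labeled_documents = {
#         "label_a": [["I", "aa", "aa", "aa"], ["bb", "aa", "aa", "aa"]],
#         "label_b": [["bb", "bb", "bb"], ["hero", "ok", "bb"]],
#     }
#     data_csr = DataConverter().labeledMultiDocs2DocFreqMatrix(
#         labeled_documents=labeled_documents,
#         ngram=1,
#         n_jobs=1)
#     # data_csr.csr_matrix_ is a scipy csr_matrix; rows map to labels via
#     # data_csr.label2id_dict and columns map to features via
#     # data_csr.vocabulary, with each cell holding a document frequency.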
def labeledMultiDocs2TermFreqMatrix(self,
                                    labeled_documents: AvailableInputTypes,
                                    is_use_cache: bool = False,
                                    is_use_memmap: bool = False,
                                    path_working_dir: str = None,
                                    joblib_backend: str = 'auto',
                                    cache_backend: str = 'PersistentDict',
                                    ngram: int = 1,
                                    n_jobs: int = 1) -> DataCsrMatrix:
    """* What you can do
    - This function makes a TERM-frequency matrix for TF-IDF calculation.
    - The TERM-frequency matrix is a scipy.csr_matrix.

    * Params
    - labeled_documents: Dict object which has a category name as its key and a list of tokenized documents (each a list of feature strings) as its value
    - is_use_cache: boolean flag to use the disk drive for keeping objects which tend to be huge.
    - path_working_dir: path to a directory for saving cache files. A temporary directory is created when None.
    """
    self.__check_data_structure(labeled_documents)
    # A tempfile.mkdtemp() default argument would be evaluated only once, at
    # function-definition time, so the directory is created lazily here instead.
    if path_working_dir is None:
        path_working_dir = tempfile.mkdtemp()

    if ngram > 1:
        labeled_documents = ngram_constructor.ngram_constructor(
            labeled_documents=labeled_documents,
            ngram=ngram,
            n_jobs=n_jobs)

    logger.debug(msg='Now pre-processing before CSR matrix')
    # convert data structure
    set_document_information = labeledMultiDocs2labeledDocsSet.multiDocs2TermFreqInfo(labeled_documents)

    # count n(docs) per label
    n_docs_distribution = self.count_document_distribution(
        labeled_documents=labeled_documents,
        label2id=set_document_information.label2id)

    # count term-frequency per label
    term_frequency_distribution = self.count_term_frequency_distribution(
        labeled_documents=labeled_documents,
        label2id=set_document_information.label2id)

    return DataCsrMatrix(
        csr_matrix_=set_document_information.matrix_object,
        label2id_dict=set_document_information.label2id,
        vocabulary=set_document_information.feature2id,
        n_docs_distribution=n_docs_distribution,
        n_term_freq_distribution=term_frequency_distribution,
        is_use_cache=is_use_cache,
        is_use_memmap=is_use_memmap,
        path_working_dir=path_working_dir,
        cache_backend=cache_backend)
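# --- Usage sketch (illustrative, kept as a comment so the class body stays valid) ---
# A minimal sketch of the term-frequency path, e.g. as input to TF-IDF-style
# scoring. As above, `DataConverter` is an assumed class name.
#
#     data_csr = DataConverter().labeledMultiDocs2TermFreqMatrix(
#         labeled_documents=labeled_documents,
#         is_use_cache=True)  # keep large intermediate objects on disk
#     # With is_use_cache=True and no path_working_dir given, cache files go
#     # into a freshly created temporary directory (tempfile.mkdtemp()).
#     print(data_csr.n_term_freq_distribution)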