def test_check_same_csr_matrix(self):
    """Run the conversion several times and confirm every run yields the
    same csr_matrix, vocabulary, label mapping and distributions.

    (Original docstring, translated from Japanese: perform the conversion
    multiple times and confirm the csr_matrix is identical.)
    """
    n_joblib_tasks = 2

    def _convert():
        # One full conversion run over the shared fixture.
        result = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
            labeled_documents=self.input_dict,
            ngram=1,
            n_jobs=n_joblib_tasks)
        assert isinstance(result, DataCsrMatrix)
        return result

    runs = [_convert() for _ in range(3)]
    dense = [r.csr_matrix_.toarray() for r in runs]

    # The dense matrices must agree between every pair of runs.
    assert numpy.array_equal(dense[0], dense[1])
    assert numpy.array_equal(dense[1], dense[2])
    assert numpy.array_equal(dense[0], dense[2])

    # The vocabularies must also be stable across runs.
    assert runs[0].vocabulary == runs[1].vocabulary == runs[2].vocabulary

    # BUGFIX: the original extracted label2id_dict and both distributions
    # on every run but never compared them; compare them as well.
    assert runs[0].label2id_dict == runs[1].label2id_dict == runs[2].label2id_dict
    assert numpy.array_equal(runs[0].n_docs_distribution, runs[1].n_docs_distribution)
    assert numpy.array_equal(runs[1].n_docs_distribution, runs[2].n_docs_distribution)
    assert numpy.array_equal(runs[0].n_term_freq_distribution, runs[1].n_term_freq_distribution)
    assert numpy.array_equal(runs[1].n_term_freq_distribution, runs[2].n_term_freq_distribution)
def setUp(self):
    """Build a document-frequency matrix from a fixed three-label corpus
    and stash the converted pieces for the test methods."""
    fixture_corpus = {
        "label_a": [
            ["I", "aa", "aa", "aa", "aa", "aa"],
            ["bb", "aa", "aa", "aa", "aa", "aa"],
            ["I", "aa", "hero", "some", "ok", "aa"],
        ],
        "label_b": [
            ["bb", "bb", "bb"],
            ["bb", "bb", "bb"],
            ["hero", "ok", "bb"],
            ["hero", "cc", "bb"],
        ],
        "label_c": [
            ["cc", "cc", "cc"],
            ["cc", "cc", "bb"],
            ["xx", "xx", "cc"],
            ["aa", "xx", "cc"],
        ],
    }
    converter = data_converter.DataConverter()
    matrix_object = converter.convert_multi_docs2document_frequency_matrix(
        labeled_documents=fixture_corpus, n_jobs=5)
    assert isinstance(matrix_object, DataCsrMatrix)
    # Expose the converted artefacts on the test instance.
    self.label2id_dict = matrix_object.label2id_dict
    self.csr_matrix_ = matrix_object.csr_matrix_
    self.n_docs_distribution = matrix_object.n_docs_distribution
    self.vocabulary = matrix_object.vocabulary
def test_bns_cython(self):
    """Run BNS scoring with the cython backend on a two-label corpus and
    print the result (smoke test: completing without raising is the check)."""
    incorrect_input_dict = {
        "label_a": [
            ["I", "aa", "aa", "aa", "aa", "aa"],
            ["bb", "aa", "aa", "aa", "aa", "aa"],
            ["I", "aa", "hero", "some", "ok", "aa"],
        ],
        "label_b": [
            ["bb", "bb", "bb"],
            ["bb", "bb", "bb"],
            ["hero", "ok", "bb"],
            ["hero", "cc", "bb"],
            ["cc", "cc", "cc"],
            ["cc", "cc", "bb"],
            ["xx", "xx", "cc"],
            ["aa", "xx", "cc"],
        ],
    }
    matrix_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
        labeled_documents=incorrect_input_dict, n_jobs=5)
    assert isinstance(matrix_object, DataCsrMatrix)
    result_bns = bns_python3.BNS().fit_transform(
        X=matrix_object.csr_matrix_,
        y=None,
        unit_distribution=matrix_object.n_docs_distribution,
        use_cython=True)
    print(result_bns)
def test_check_input_error(self):
    """Feed BNS a three-label input (it expects two labels) and make sure
    the failure path can be exercised without crashing the suite."""
    incorrect_input_dict = {
        "label_a": [
            ["I", "aa", "aa", "aa", "aa", "aa"],
            ["bb", "aa", "aa", "aa", "aa", "aa"],
            ["I", "aa", "hero", "some", "ok", "aa"],
        ],
        "label_b": [
            ["bb", "bb", "bb"],
            ["bb", "bb", "bb"],
            ["hero", "ok", "bb"],
            ["hero", "cc", "bb"],
            ["cc", "cc", "cc"],
            ["cc", "cc", "bb"],
            ["xx", "xx", "cc"],
            ["aa", "xx", "cc"],
        ],
        "label_c": [
            ["aa", "xx", "cc"]
        ]
    }
    data_csr_matrix = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
        labeled_documents=incorrect_input_dict, n_jobs=5)
    assert isinstance(data_csr_matrix, DataCsrMatrix)
    csr_matrix_ = data_csr_matrix.csr_matrix_
    n_docs_distribution = data_csr_matrix.n_docs_distribution
    # BUGFIX: the original used a bare `except:`, which also swallows
    # SystemExit/KeyboardInterrupt; catch Exception instead.
    try:
        bns_python3.BNS().fit_transform(
            X=csr_matrix_,
            y=None,
            unit_distribution=n_docs_distribution)
    except Exception:
        pass
def test_fit_transform(self):
    """BNS fit_transform on a document-frequency matrix should return a
    csr_matrix that converts into a list of score records."""
    matrix_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
        labeled_documents=self.correct_input, n_jobs=5)
    assert isinstance(matrix_object, DataCsrMatrix)

    scored = bns_python3.BNS().fit_transform(
        X=matrix_object.csr_matrix_,
        y=None,
        unit_distribution=matrix_object.n_docs_distribution,
        verbose=True)
    assert isinstance(scored, csr_matrix)

    score_records = ScoredResultObject(
        scored_matrix=scored,
        label2id_dict=matrix_object.label2id_dict,
        feature2id_dict=matrix_object.vocabulary
    ).ScoreMatrix2ScoreDictionary()
    assert isinstance(score_records, list)
    import pprint
    pprint.pprint(score_records)
def test_complex_feature_convertion(self):
    """Convert the complex-feature fixture with n_jobs=1 and verify the
    resulting matrix shape (3 samples x 5 features) and its exact contents."""
    matrix_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
        labeled_documents=self.input_dict_complex_feature, n_jobs=1)
    assert isinstance(matrix_object, DataCsrMatrix)

    doc_freq_matrix = matrix_object.csr_matrix_
    label_mapping = matrix_object.label2id_dict
    feature_mapping = matrix_object.vocabulary
    assert isinstance(doc_freq_matrix, csr_matrix)
    assert isinstance(label_mapping, dict)
    assert isinstance(feature_mapping, dict)

    n_expected_samples = 3
    n_expected_features = 5
    assert doc_freq_matrix.shape[0] == n_expected_samples
    assert doc_freq_matrix.shape[1] == n_expected_features

    # vocaburary id of correct matrix is {'cc': 3, 'aa': 1, 'some': 6, 'xx': 7, 'I': 0, 'ok': 5, 'hero': 4, 'bb': 2}
    # label id of correct matrix is {'label_c': 2, 'label_a': 0, 'label_b': 1}
    expected_matrix = numpy.array([
        [1.0, 3.0, 0.0, 2.0, 0.0],
        [0.0, 0.0, 1.0, 3.0, 1.0],
        [1.0, 1.0, 0.0, 2.0, 1.0],
    ]).astype(numpy.int64)
    assert numpy.array_equal(expected_matrix, doc_freq_matrix.toarray())
def setUp(self):
    """Convert a fixed three-label corpus into a document-frequency matrix,
    verify its contents, and stash the pieces used by the test methods."""
    input_dict = {
        "label_a": [
            ["I", "aa", "aa", "aa", "aa", "aa"],
            ["bb", "aa", "aa", "aa", "aa", "aa"],
            ["I", "aa", "hero", "some", "ok", "aa"],
        ],
        "label_b": [
            ["bb", "bb", "bb"],
            ["bb", "bb", "bb"],
            ["hero", "ok", "bb"],
            ["hero", "cc", "bb"],
        ],
        "label_c": [
            ["cc", "cc", "cc"],
            ["cc", "cc", "bb"],
            ["xx", "xx", "cc"],
            ["aa", "xx", "cc"],
        ],
    }
    # Expected DOCUMENT-frequency matrix for this exact fixture (it is the
    # same expectation asserted in test_basic_convert_data).
    expected_doc_freq_matrix = numpy.array(
        [[2, 3, 1, 0, 1, 1, 1, 0],
         [0, 0, 4, 1, 2, 1, 0, 0],
         [0, 1, 1, 4, 0, 0, 0, 2]])
    data_csr_matrix = data_converter.DataConverter(
    ).labeledMultiDocs2DocFreqMatrix(labeled_documents=input_dict,
                                     ngram=1,
                                     n_jobs=-1)
    assert isinstance(data_csr_matrix, DataCsrMatrix)
    self.label2id_dict = data_csr_matrix.label2id_dict
    self.csr_matrix_ = data_csr_matrix.csr_matrix_
    self.n_docs_distribution = data_csr_matrix.n_docs_distribution
    self.vocabulary = data_csr_matrix.vocabulary
    # BUGFIX: the original computed numpy.array_equal(...) and DISCARDED
    # the boolean result, so nothing was checked — and the expected matrix
    # it compared against was a TERM-frequency matrix, which can never
    # equal the document-frequency matrix built above.  Assert against the
    # correct document-frequency expectation instead.
    assert numpy.array_equal(self.csr_matrix_.toarray(), expected_doc_freq_matrix)
def test_multi_process_convert_data(self):
    """The conversion must also succeed when more than one worker is used.

    :return:
    """
    converted = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
        labeled_documents=self.input_dict, n_jobs=5)
    # Only type checks here; content checks live in test_basic_convert_data.
    assert isinstance(converted.csr_matrix_, csr_matrix)
    assert isinstance(converted.label2id_dict, dict)
    assert isinstance(converted.vocabulary, dict)
def test_soa_doc_freq(self):
    """SOA scoring on a document-frequency matrix should convert into a
    list of score records."""
    matrix_object = data_converter.DataConverter(
    ).convert_multi_docs2document_frequency_matrix(
        labeled_documents=self.input_dict, n_jobs=5)
    assert isinstance(matrix_object, data_converter.DataCsrMatrix)

    scored_matrix_doc_freq = soa_python3.SOA().fit_transform(
        X=matrix_object.csr_matrix_,
        unit_distribution=matrix_object.n_docs_distribution,
        verbose=True)
    soa_scores_doc_freq = ScoredResultObject(
        scored_matrix=scored_matrix_doc_freq,
        label2id_dict=matrix_object.label2id_dict,
        feature2id_dict=matrix_object.vocabulary).convert_score_matrix2score_record()
    # IDIOM FIX: assertIsInstance reports the actual type on failure,
    # unlike assertTrue(isinstance(...)).
    self.assertIsInstance(soa_scores_doc_freq, list)
def test_soa_doc_freq(self):
    """SOA scoring based on document frequency; dump the resulting score
    dictionary for manual inspection."""
    matrix_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
        labeled_documents=self.input_dict, n_jobs=5)
    assert isinstance(matrix_object, data_converter.DataCsrMatrix)

    scored_matrix = soa_python3.SOA().fit_transform(
        X=matrix_object.csr_matrix_,
        unit_distribution=matrix_object.n_docs_distribution,
        verbose=True)
    score_records = ScoredResultObject(
        scored_matrix=scored_matrix,
        label2id_dict=matrix_object.label2id_dict,
        feature2id_dict=matrix_object.vocabulary).ScoreMatrix2ScoreDictionary()
    import pprint
    print('doc freq based soa')
    pprint.pprint(score_records)
def test_basic_convert_data(self):
    """Verify the integrity of the converted document-frequency matrix:
    types, shape, and exact cell values for the shared fixture.

    (Original note, translated: a mistake in the data-convert process was
    suspected, so the consistency of the result is checked here.)
    :return:
    """
    matrix_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
        labeled_documents=self.input_dict, ngram=1, n_jobs=5)
    assert isinstance(matrix_object, DataCsrMatrix)

    doc_freq_matrix = matrix_object.csr_matrix_
    label_mapping = matrix_object.label2id_dict
    feature_mapping = matrix_object.vocabulary
    assert isinstance(doc_freq_matrix, csr_matrix)
    assert isinstance(label_mapping, dict)
    assert isinstance(feature_mapping, dict)

    n_expected_samples = 3
    n_expected_features = 8
    assert doc_freq_matrix.shape[0] == n_expected_samples
    assert doc_freq_matrix.shape[1] == n_expected_features

    # vocaburary id of correct matrix is {'cc': 3, 'aa': 1, 'some': 6, 'xx': 7, 'I': 0, 'ok': 5, 'hero': 4, 'bb': 2}
    # label id of correct matrix is {'label_c': 2, 'label_a': 0, 'label_b': 1}
    expected_matrix = numpy.array(
        [[2, 3, 1, 0, 1, 1, 1, 0],
         [0, 0, 4, 1, 2, 1, 0, 0],
         [0, 1, 1, 4, 0, 0, 0, 2]]).astype(numpy.int64)
    assert numpy.array_equal(expected_matrix, doc_freq_matrix.toarray())
def test_fit_transform(self):
    """BNS fit_transform should produce a csr_matrix that converts into a
    list of score records."""
    matrix_object = data_converter.DataConverter(
    ).convert_multi_docs2document_frequency_matrix(
        labeled_documents=self.correct_input, n_jobs=5)
    assert isinstance(matrix_object, DataCsrMatrix)

    bns_score_csr_matrix = bns_python3.BNS().fit_transform(
        X=matrix_object.csr_matrix_,
        y=None,
        unit_distribution=matrix_object.n_docs_distribution,
        verbose=True)
    assert isinstance(bns_score_csr_matrix, csr_matrix)

    bns_scores_dict = ScoredResultObject(
        scored_matrix=bns_score_csr_matrix,
        label2id_dict=matrix_object.label2id_dict,
        feature2id_dict=matrix_object.vocabulary).convert_score_matrix2score_record()
    # BUGFIX: the original `self.assertTrue(bns_scores_dict, list)` passed
    # `list` as the assertion *message* and only truth-tested the records;
    # assertIsInstance performs the intended type check.
    self.assertIsInstance(bns_scores_dict, list)
def run_feature_selection(input_dict: AvailableInputTypes,
                          method: str,
                          use_cython: bool = False,
                          is_use_cache: bool = False,
                          is_use_memmap: bool = False,
                          cache_backend: str = 'PersistentDict',
                          path_working_dir: str = None,
                          matrix_form=None,
                          n_jobs: int = 1) -> ScoredResultObject:
    """An interface function of the DocumentFeatureSelection package.

    Args:
        input_dict: Dict-like object with a category name as key and a list
            of features as value.  You can pass dict, sqlitedict.SqliteDict,
            or DocumentFeatureSelection.models.PersistentDict.
        method: Name of the feature-selection metric; must be in METHOD_NAMES.
        use_cython: Use the cython implementation (much faster than the
            native-python code).
        is_use_cache: Keep objects that tend to be huge on disk.
        is_use_memmap: Keep the matrix object as a memmap.
        cache_backend: Cache backend name when is_use_cache is True
            ('PersistentDict' or 'SqliteDict').
        path_working_dir: Directory where cache files / memmap objects are
            saved.  When None and caching is requested, a temporary
            directory is created (and removed at the end).
        matrix_form: 'term_freq' to score soa on a term-frequency matrix;
            None selects the document-frequency input.
        n_jobs: Number of worker jobs for conversion and scoring.

    Returns:
        ScoredResultObject holding the scored sparse matrix plus the
        label/feature id mappings.

    Raises:
        Exception: If `method` is unknown or the method/matrix_form
            combination is unsupported.
        KeyError: If BNS is requested with three or more labels.
    """
    if method not in METHOD_NAMES:
        raise Exception('method name must be either of {}. Yours: {}'.format(
            METHOD_NAMES, method))

    if (is_use_cache or is_use_memmap) and path_working_dir is None:
        path_working_dir = mkdtemp()
        logger.info(
            "Temporary files are created under {}".format(path_working_dir))

    if method == 'tf_idf':
        # ATTENTION: the input for TF-IDF MUST be a term-frequency matrix,
        # NOT a document-frequency matrix.
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2term_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir,
            cache_backend=cache_backend)
        assert isinstance(matrix_data_object, DataCsrMatrix)
        scored_sparse_matrix = TFIDF().fit_transform(
            X=matrix_data_object.csr_matrix_)
        assert isinstance(scored_sparse_matrix, csr_matrix)
    elif method in ['soa', 'pmi'] and matrix_form is None:
        # Score with either soa or pmi on a document-frequency matrix.
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2document_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
        assert isinstance(matrix_data_object, DataCsrMatrix)
        if method == 'pmi':
            backend_strategy = decide_joblib_strategy(
                matrix_data_object.vocabulary)
            scored_sparse_matrix = PMI().fit_transform(
                X=matrix_data_object.csr_matrix_,
                n_docs_distribution=matrix_data_object.n_docs_distribution,
                n_jobs=n_jobs,
                joblib_backend=backend_strategy,
                use_cython=use_cython)
            assert isinstance(scored_sparse_matrix, csr_matrix)
        elif method == 'soa':
            backend_strategy = decide_joblib_strategy(
                matrix_data_object.vocabulary)
            scored_sparse_matrix = SOA().fit_transform(
                X=matrix_data_object.csr_matrix_,
                unit_distribution=matrix_data_object.n_docs_distribution,
                n_jobs=n_jobs,
                joblib_backend=backend_strategy,
                use_cython=use_cython)
            assert isinstance(scored_sparse_matrix, csr_matrix)
        else:
            raise Exception('unsupported method: {}'.format(method))
    elif method == 'soa' and matrix_form == 'term_freq':
        # Score with soa on a term-frequency matrix.
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2term_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
        assert isinstance(matrix_data_object, DataCsrMatrix)
        backend_strategy = decide_joblib_strategy(
            matrix_data_object.vocabulary)
        scored_sparse_matrix = SOA().fit_transform(
            X=matrix_data_object.csr_matrix_,
            unit_distribution=matrix_data_object.n_docs_distribution,
            n_jobs=n_jobs,
            joblib_backend=backend_strategy)
        assert isinstance(scored_sparse_matrix, csr_matrix)
    elif method == 'bns':
        # BNS requires exactly two labels.
        # BUGFIX: the original message said "more than 3 keys", but the
        # condition rejects 3 or more.
        if len(input_dict.keys()) >= 3:
            raise KeyError(
                'input_dict must not have 3 or more keys if you would like to use BNS.')
        # Consider the shorter label name as the positive label
        # (positive and negative do NOT carry any meaning in this context).
        # BUGFIX: this assignment appeared commented out in the original
        # while positive_label_name was still used below, which would
        # raise NameError at runtime.
        positive_label_name = sorted(input_dict.keys(), key=lambda x: len(x))[0]
        matrix_data_object = data_converter.DataConverter(
        ).convert_multi_docs2document_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=n_jobs,
            is_use_cache=is_use_cache,
            is_use_memmap=is_use_memmap,
            path_working_dir=path_working_dir)
        assert isinstance(matrix_data_object, DataCsrMatrix)
        true_class_index = matrix_data_object.label2id_dict[
            positive_label_name]
        backend_strategy = decide_joblib_strategy(
            matrix_data_object.vocabulary)
        scored_sparse_matrix = BNS().fit_transform(
            X=matrix_data_object.csr_matrix_,
            unit_distribution=matrix_data_object.n_term_freq_distribution,
            n_jobs=n_jobs,
            true_index=true_class_index,
            joblib_backend=backend_strategy,
            use_cython=use_cython)
        assert isinstance(scored_sparse_matrix, csr_matrix)
    else:
        raise Exception('unsupported method/matrix_form combination: {} / {}'.format(
            method, matrix_form))

    logger.info('Done computation.')

    # Delete the temporary file directory if we created cache/memmap files.
    if is_use_cache or is_use_memmap:
        logger.debug("Delete temporary files {}".format(path_working_dir))
        shutil.rmtree(path_working_dir)

    return ScoredResultObject(scored_matrix=scored_sparse_matrix,
                              label2id_dict=matrix_data_object.label2id_dict,
                              feature2id_dict=matrix_data_object.vocabulary,
                              method=method,
                              matrix_form=matrix_form,
                              frequency_matrix=matrix_data_object.csr_matrix_)
def test_get_pmi_feature_dictionary(self):
    """Check that a PMI-scored matrix converts into dictionary output for
    both supported outformat values.

    :return:
    """
    data_csr_object = data_converter.DataConverter().labeledMultiDocs2DocFreqMatrix(
        labeled_documents=self.input_dict, ngram=1, n_jobs=5)
    assert isinstance(data_csr_object.csr_matrix_, csr_matrix)
    assert isinstance(data_csr_object.label2id_dict, dict)
    assert isinstance(data_csr_object.vocabulary, dict)

    pmi_scored_matrix = PMI_python3.PMI().fit_transform(
        X=data_csr_object.csr_matrix_,
        n_jobs=5,
        n_docs_distribution=data_csr_object.n_docs_distribution)

    def _score_output(outformat):
        # Build a fresh result object per call, mirroring the original test.
        return ScoredResultObject(
            scored_matrix=pmi_scored_matrix,
            label2id_dict=data_csr_object.label2id_dict,
            feature2id_dict=data_csr_object.vocabulary
        ).ScoreMatrix2ScoreDictionary(outformat=outformat,
                                      sort_desc=True,
                                      n_jobs=5)

    # NOTE(review): the original comments mentioned a cut_zero flag, but no
    # such argument was ever passed; only outformat varies between calls.
    # The original invoked the conversion four times (dict, items, dict,
    # items); that call sequence is preserved here.
    for outformat in ('dict', 'items', 'dict', 'items'):
        scored = _score_output(outformat)
        if outformat == 'dict':
            assert isinstance(scored, dict)
        else:
            assert isinstance(scored, list)
            for record in scored:
                assert isinstance(record, dict)
        logging.debug(scored)