def test_compute_and_store_bundle():
    tag: str = f'{uuid.uuid4()}'
    target_folder: str = jj(OUTPUT_FOLDER, tag)
    target_filename: str = co_occurrence.to_filename(folder=target_folder, tag=tag)

    os.makedirs(target_folder, exist_ok=True)

    simple_corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDEFG_3DOCS)
    context_opts: co_occurrence.ContextOpts = co_occurrence.ContextOpts(
        concept={'g'}, ignore_concept=False, context_width=2
    )

    bundle: co_occurrence.Bundle = test_utils.create_simple_bundle_by_pipeline(
        data=simple_corpus,
        context_opts=context_opts,
        folder=target_folder,
        tag=tag,
    )

    # Storing the bundle must produce the file that to_filename predicts.
    bundle.store()

    assert os.path.isfile(target_filename)

    shutil.rmtree(target_folder, ignore_errors=True)
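# A minimal variant of the setup above, assuming only the co_occurrence API already
# used in this test. tempfile.TemporaryDirectory removes the folder even when an
# assertion fails, so no explicit shutil.rmtree cleanup is needed. This is an
# illustrative sketch (underscore-prefixed so pytest does not collect it), not a
# replacement for the test above.
def _store_bundle_in_tempdir_sketch():
    import tempfile

    with tempfile.TemporaryDirectory() as target_folder:
        tag: str = f'{uuid.uuid4()}'
        target_filename: str = co_occurrence.to_filename(folder=target_folder, tag=tag)

        bundle: co_occurrence.Bundle = test_utils.create_simple_bundle_by_pipeline(
            data=very_simple_corpus(SIMPLE_CORPUS_ABCDEFG_3DOCS),
            context_opts=co_occurrence.ContextOpts(concept={'g'}, ignore_concept=False, context_width=2),
            folder=target_folder,
            tag=tag,
        )
        bundle.store()

        assert os.path.isfile(target_filename)
    # The directory and its contents are gone once the context manager exits.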
def test_term_term_matrix_to_co_occurrences_with_ignore_ids():
    # '*' is a padding token that must not appear in the resulting pairs.
    text_corpus = very_simple_corpus(
        data=[
            ('tran_2019_01_test.txt', ['*', 'b', 'c', 'c']),
            ('tran_2019_02_test.txt', ['a', '*', '*', 'd']),
            ('tran_2019_03_test.txt', ['a', 'e', 'e', 'b']),
            ('tran_2020_01_test.txt', ['*', 'c', 'd', 'a']),
            ('tran_2020_02_test.txt', ['a', 'b', '*', '*']),
        ]
    )
    token2id: Token2Id = Token2Id(text_corpus.token2id)

    term_term_matrix = (
        dtm.CorpusVectorizer()
        .fit_transform(text_corpus, already_tokenized=True, vocabulary=text_corpus.token2id)
        .co_occurrence_matrix()
    )

    pad_id = token2id['*']

    co_occurrences = term_term_matrix_to_co_occurrences(
        term_term_matrix=term_term_matrix,
        threshold_count=1,
        ignore_ids={pad_id},
    )

    # No pair in the result may involve the padding token.
    assert not (co_occurrences.w1_id == pad_id).any()
    assert not (co_occurrences.w2_id == pad_id).any()
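# For reference, a self-contained sketch of the conversion this test exercises:
# turning a sparse term-term matrix into (w1_id, w2_id, value) rows while dropping
# ignored ids and sub-threshold counts. This is an illustrative re-implementation
# of the assumed semantics, not the library's actual term_term_matrix_to_co_occurrences.
def _ttm_to_co_occurrences_sketch(ttm, threshold_count=1, ignore_ids=None):
    import pandas as pd

    coo = ttm.tocoo()  # COO exposes parallel row/col/data arrays for stored entries
    frame = pd.DataFrame({'w1_id': coo.row, 'w2_id': coo.col, 'value': coo.data})
    if ignore_ids:
        frame = frame[~frame.w1_id.isin(ignore_ids) & ~frame.w2_id.isin(ignore_ids)]
    return frame[frame.value >= threshold_count].reset_index(drop=True)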
def test_to_co_occurrence_matrix():
    text_corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)

    term_term_matrix1 = very_simple_term_term_matrix(text_corpus)
    term_term_matrix2 = co_occurrence.to_co_occurrence_matrix(text_corpus)

    assert (term_term_matrix1 != term_term_matrix2).nnz == 0
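# The (A != B).nnz == 0 idiom above works because comparing two sparse matrices
# yields a sparse boolean matrix whose stored entries mark the positions that
# differ; zero stored entries therefore means the matrices are identical. A tiny
# self-contained illustration with plain scipy:
def _sparse_equality_idiom_example():
    from scipy.sparse import csr_matrix

    a = csr_matrix([[0, 1], [2, 0]])
    b = csr_matrix([[0, 1], [2, 0]])
    c = csr_matrix([[0, 1], [0, 0]])

    assert (a != b).nnz == 0  # equal: no differing positions stored
    assert (a != c).nnz == 1  # one position differs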
def test_co_occurrence_matrix_of_corpus_returns_correct_result():
    expected_token2id = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}
    expected_matrix = np.matrix(
        [
            [0, 6, 4, 3, 3],
            [0, 0, 2, 1, 4],
            [0, 0, 0, 2, 0],
            [0, 0, 0, 0, 0],
            [0, 0, 0, 0, 0],
        ]
    )

    corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    v_corpus = CorpusVectorizer().fit_transform(corpus, already_tokenized=True, vocabulary=corpus.token2id)

    term_term_matrix = v_corpus.co_occurrence_matrix()

    # The expected matrix is upper triangular: each unordered pair is counted once.
    assert (term_term_matrix.todense() == expected_matrix).all()
    assert expected_token2id == v_corpus.token2id
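# One common way to derive a document-level co-occurrence matrix from a
# bag-of-words matrix X is X.T @ X with the diagonal removed and only the upper
# triangle kept, which matches the upper-triangular shape asserted above. Whether
# CorpusVectorizer.co_occurrence_matrix uses exactly this weighting is an
# assumption; the sketch only illustrates the general construction.
def _co_occurrence_from_bow_sketch(bag_of_words):
    import scipy.sparse as sp

    ttm = bag_of_words.T @ bag_of_words  # token-by-token counts summed over documents
    ttm.setdiag(0)                       # drop self co-occurrence
    return sp.triu(ttm)                  # count each unordered pair once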
def test_term_term_matrix_to_co_occurrences_with_no_ignore_ids():
    text_corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    term_term_matrix: scipy.sparse.spmatrix = very_simple_term_term_matrix(text_corpus)

    co_occurrences = term_term_matrix_to_co_occurrences(
        term_term_matrix=term_term_matrix,
        threshold_count=1,
        ignore_ids=None,
    )

    fg = text_corpus.token2id.get

    # With nothing ignored, the conversion preserves the total co-occurrence count.
    assert co_occurrences.value.sum() == term_term_matrix.sum()
    assert 4 == int(co_occurrences[(co_occurrences.w1_id == fg('a')) & (co_occurrences.w2_id == fg('c'))].value)
    assert 1 == int(co_occurrences[(co_occurrences.w1_id == fg('b')) & (co_occurrences.w2_id == fg('d'))].value)
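# A small helper along the lines of the lookups above; _pair_count is a
# hypothetical convenience, not part of the library API. It assumes the
# DataFrame-style (w1_id, w2_id, value) columns used throughout these tests.
def _pair_count(co_occurrences, token2id, w1, w2):
    mask = (co_occurrences.w1_id == token2id.get(w1)) & (co_occurrences.w2_id == token2id.get(w2))
    return int(co_occurrences[mask].value.sum())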
def test_TTM_to_co_occurrence_DTM_using_LIL_matrix():
    source_corpus = very_simple_corpus(SIMPLE_CORPUS_ABCDE_5DOCS)
    token2id = Token2Id(source_corpus.token2id)
    document_index: DocumentIndex = source_corpus.document_index

    # One payload per document, each carrying that document's term-term matrix.
    stream: Iterable[CoOccurrencePayload] = (
        CoOccurrencePayload(
            document_id,
            document_name="-",
            ttm_data_map={
                VectorizeType.Normal: VectorizedTTM(
                    vectorize_type=VectorizeType.Normal,
                    term_term_matrix=CorpusVectorizer()
                    .fit_transform([doc], already_tokenized=True, vocabulary=token2id.data)
                    .co_occurrence_matrix(),
                    term_window_counts={},
                    document_id=document_id,
                )
            },
        )
        for document_id, doc in enumerate(source_corpus)
    )

    pair2id: Token2Id = Token2Id()

    builder: CoOccurrenceCorpusBuilder = CoOccurrenceCorpusBuilder(
        vectorize_type=VectorizeType.Normal,
        document_index=document_index,
        pair2id=pair2id,
        token2id=token2id,
    )

    # Register each document's pairs in the pair vocabulary, then add its counts.
    for payload in stream:
        builder.ingest_pairs(payload).add(payload)

    corpus: VectorizedCorpus = builder.corpus

    assert corpus is not None
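# The test name references scipy's LIL format, which suits the incremental,
# document-by-document ingestion pattern above: writes into a LIL matrix are
# cheap, and the result is converted to CSR once for fast arithmetic. Whether
# CoOccurrenceCorpusBuilder uses LIL internally in exactly this way is an
# assumption; this self-contained sketch only shows the pattern itself.
def _incremental_lil_example():
    import scipy.sparse as sp

    matrix = sp.lil_matrix((3, 5), dtype=int)  # documents x pair-vocabulary
    matrix[0, 1] = 2                           # cheap incremental writes
    matrix[1, 4] = 1
    matrix[2, 0] = 3
    return matrix.tocsr()                      # convert once for efficient math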