def test_token2id_store_and_load():
    os.makedirs('./tests/output', exist_ok=True)

    token2id: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM1)

    filename = './tests/output/test_vocabulary.zip'
    tf_filename = path_add_suffix(filename, "_tf", new_extension=".pbz2")

    # `store` persists both the vocabulary and its term-frequency companion file
    token2id.store(filename=filename)

    assert os.path.isfile(filename) and os.path.isfile(tf_filename)

    token2id_loaded: Token2Id = Token2Id.load(filename=filename)

    assert token2id_loaded is not None
    assert token2id_loaded.tf is not None
    assert token2id_loaded.data == {'adam': 0, 'anton': 1, 'beatrice': 2, 'felicia': 3, 'niklas': 4}
    assert dict(token2id_loaded.tf) == {0: 3, 1: 2, 2: 1, 3: 1, 4: 1}
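# A minimal sketch (not part of the original suite) of what the tf assertion above
# encodes: the loaded `tf` behaves like a Counter keyed by token id. It assumes that
# TEST_TOKENS_STREAM1 yields 'adam' three times, 'anton' twice, and the remaining
# names once each, with ids handed out in the order shown in the test.
from collections import Counter

def sketch_expected_term_frequencies():
    tokens = ['adam', 'adam', 'adam', 'anton', 'anton', 'beatrice', 'felicia', 'niklas']  # assumed stream content
    ids = {'adam': 0, 'anton': 1, 'beatrice': 2, 'felicia': 3, 'niklas': 4}
    assert dict(Counter(ids[token] for token in tokens)) == {0: 3, 1: 2, 2: 1, 3: 1, 4: 1}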
def test_interfaces_token2id_store():
    os.makedirs('./tests/output', exist_ok=True)

    filename: str = './tests/output/test_interfaces_token2id_store.zip'

    token2id = Token2Id()
    token2id.ingest(['apa', 'banan', 'soffa'])
    token2id.store(filename)

    assert pathlib.Path(filename).exists()

    token2id_loaded: Token2Id = Token2Id.load(filename)
    assert token2id.data == token2id_loaded.data
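# Usage sketch (assumption, not confirmed by the suite: `ingest` assigns dense,
# zero-based ids in first-seen order, which is consistent with the fixtures above).
def sketch_ingest_assigns_ids_in_first_seen_order():
    token2id = Token2Id()
    token2id.ingest(['apa', 'banan', 'soffa'])
    assert dict(token2id.data) == {'apa': 0, 'banan': 1, 'soffa': 2}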
def test_co_occurrences_to_co_occurrence_corpus():
    folder, tag = './tests/test_data/ABCDEFG_7DOCS_CONCEPT', "ABCDEFG_7DOCS_CONCEPT"

    co_occurrences: CoOccurrenceDataFrame = co_occurrence.load_co_occurrences(co_occurrence_filename(folder, tag))
    document_index: DocumentIndex = DocumentIndexHelper.load(document_index_filename(folder, tag)).document_index
    token2id: Token2Id = Token2Id.load(vocabulary_filename(folder, tag))

    corpus = LegacyCoOccurrenceMixIn.from_co_occurrences(
        co_occurrences=co_occurrences,
        document_index=document_index,
        token2id=token2id,
    )

    # The corpus must preserve the total co-occurrence count, with one row per
    # document and one column per distinct (w1_id, w2_id) token pair
    assert corpus.data.sum() == co_occurrences.value.sum()
    assert corpus.data.shape[0] == len(document_index)
    assert corpus.data.shape[1] == len(co_occurrences[["w1_id", "w2_id"]].drop_duplicates())
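# Sketch of the shape arithmetic the assertions above encode, using plain pandas on
# hypothetical data (not the ABCDEFG fixture): each distinct (w1_id, w2_id) pair
# becomes one column of the resulting matrix, each document one row, and the grand
# total of the matrix equals the sum of the `value` column.
import pandas as pd

def sketch_co_occurrence_corpus_shape():
    co_occurrences = pd.DataFrame(
        {
            'document_id': [0, 0, 1],
            'w1_id': [0, 0, 1],
            'w2_id': [1, 2, 2],
            'value': [2, 1, 3],
        }
    )
    n_pairs = len(co_occurrences[['w1_id', 'w2_id']].drop_duplicates())
    assert n_pairs == 3  # three distinct token pairs -> three columns expected
    assert co_occurrences.value.sum() == 6  # expected grand total of the matrix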