# Example #1
# 0
def test_token2id_store_and_load():
    """Round-trip a Token2Id through store()/load() and verify both the
    token mapping and the term-frequency counts survive serialization."""
    os.makedirs('./tests/output', exist_ok=True)

    vocabulary: Token2Id = Token2Id().ingest(TEST_TOKENS_STREAM1)

    vocab_filename = './tests/output/test_vocabulary.zip'
    # store() is expected to also write a companion term-frequency file
    frequency_filename = path_add_suffix(vocab_filename, "_tf", new_extension=".pbz2")

    vocabulary.store(filename=vocab_filename)

    assert os.path.isfile(vocab_filename) and os.path.isfile(frequency_filename)

    restored: Token2Id = Token2Id.load(filename=vocab_filename)

    assert restored is not None
    assert restored.tf is not None

    expected_mapping = {
        'adam': 0,
        'anton': 1,
        'beatrice': 2,
        'felicia': 3,
        'niklas': 4,
    }
    assert restored.data == expected_mapping
    assert dict(restored.tf) == {0: 3, 1: 2, 2: 1, 3: 1, 4: 1}
# Example #2
# 0
def test_interfaces_token2id_store():
    """Store a small Token2Id to disk and check the re-loaded mapping is identical."""
    os.makedirs('./tests/output', exist_ok=True)

    target: str = './tests/output/test_interfaces_token2id_store.zip'

    vocabulary = Token2Id()
    vocabulary.ingest(['apa', 'banan', 'soffa'])
    vocabulary.store(target)

    assert pathlib.Path(target).exists()

    reloaded: Token2Id = Token2Id.load(target)

    assert vocabulary.data == reloaded.data
# Example #3
# 0
def test_co_occurrences_to_co_occurrence_corpus():
    """Build a corpus from stored co-occurrence test data and verify that
    total counts, document rows, and unique-pair columns all line up."""
    folder = './tests/test_data/ABCDEFG_7DOCS_CONCEPT'
    tag = "ABCDEFG_7DOCS_CONCEPT"

    co_occurrences: CoOccurrenceDataFrame = co_occurrence.load_co_occurrences(co_occurrence_filename(folder, tag))
    document_index: DocumentIndex = DocumentIndexHelper.load(document_index_filename(folder, tag)).document_index
    token2id: Token2Id = Token2Id.load(vocabulary_filename(folder, tag))

    corpus = LegacyCoOccurrenceMixIn.from_co_occurrences(
        co_occurrences=co_occurrences,
        document_index=document_index,
        token2id=token2id,
    )

    # Sum of all corpus counts must equal the sum of the source value column.
    assert corpus.data.sum() == co_occurrences.value.sum()
    # One row per document.
    assert corpus.data.shape[0] == len(document_index)
    # One column per distinct (w1_id, w2_id) token pair.
    unique_pairs = co_occurrences[["w1_id", "w2_id"]].drop_duplicates()
    assert corpus.data.shape[1] == len(unique_pairs)