Example #1

import os
import tempfile

import pandas as pd

# The tokenization module under test; these helpers appear to come from the
# neuroquery package.
from neuroquery import tokenization


def test_make_voc_mapping():
    voc = [
        ("experiment", ),
        ("experiments", ),
        ("experience"),
        ("experimentss", ),
    ]
    freq = [1.0, 0.5, 0.2, 0.01]
    voc_mapping = tokenization.make_vocabulary_mapping(voc, freq)
    assert voc_mapping == {
        ("experiments", ): ("experiment", ),
        ("experimentss", ): ("experiment", ),
    }
    with tempfile.TemporaryDirectory() as tmp_dir:
        df = pd.DataFrame({
            "term": tokenization.tuple_sequence_to_strings(voc),
            "freq": freq
        })
        voc_file = os.path.join(tmp_dir, "voc.csv")
        df.to_csv(voc_file, header=None, index=False)
        # voc_mapping = tokenization.load_voc_mapping(voc_file)
        # assert voc_mapping == {('experiments',): ('experiment',)}
        pipe = tokenization.tokenizing_pipeline_from_vocabulary_file(
            voc_file, voc_mapping="auto")
        assert pipe.vocabulary_mapping_.voc_mapping == {
            ("experiments", ): ("experiment", ),
            ("experimentss", ): ("experiment", ),
        }
        pipe = tokenization.tokenizing_pipeline_from_vocabulary(
            voc, voc_mapping="auto", frequencies=freq)
        assert pipe.vocabulary_mapping_.voc_mapping == {
            ("experiments", ): ("experiment", ),
            ("experimentss", ): ("experiment", ),
        }
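
As a usage sketch, the "auto" mapping built above is what rewrites rarer variants onto their more frequent form when the pipeline tokenizes text. This is illustrative only, assuming the neuroquery package is installed; the toy vocabulary and the expected output are not from the test suite:

from neuroquery import tokenization

voc = [("experiment", ), ("experiments", )]
freq = [1.0, 0.5]

# voc_mapping="auto" derives the same mapping as make_vocabulary_mapping:
# the rarer "experiments" is rewritten to the more frequent "experiment".
pipe = tokenization.tokenizing_pipeline_from_vocabulary(
    voc, voc_mapping="auto", frequencies=freq)

# Out-of-vocabulary words ("several") are dropped.
print(pipe("several experiments"))  # expected: ['experiment']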
Example #2

import os
import tempfile

import pytest

from neuroquery import tokenization

# VOCABULARY_FILE is defined elsewhere in the test module; it points to a
# two-column (term, frequency) CSV vocabulary file.


# A plausible parametrization, inferred from the branches below: the test
# compares an empty mapping against the "auto" mapping, with and without
# term frequencies.
@pytest.mark.parametrize("voc_mapping", [{}, "auto"])
@pytest.mark.parametrize("with_frequencies", [False, True])
def test_tokenizing_pipeline(voc_mapping, with_frequencies):
    tok = tokenization.tokenizing_pipeline_from_vocabulary_file(
        VOCABULARY_FILE, voc_mapping=voc_mapping
    )
    if not with_frequencies:
        tok.frequencies = None
    if voc_mapping == {}:
        assert tok("the working memory group xyzzzz groups") == [
            "working memory",
            "group",
            "groups",
        ]
    else:
        assert tok("the working memory group xyzzzz groups") == [
            "working memory",
            "group",
            "group",
        ]
    assert tok.get_full_vocabulary(
        as_tuples=True
    ) == tokenization.string_sequence_to_tuples(tok.get_full_vocabulary())
    assert tok.get_vocabulary(
        as_tuples=True
    ) == tokenization.string_sequence_to_tuples(tok.get_vocabulary())
    if voc_mapping == "auto":
        assert len(tok.get_full_vocabulary()) == len(tok.get_vocabulary()) + 2
    else:
        assert len(tok.get_full_vocabulary()) == len(tok.get_vocabulary())
    assert len(tok.get_frequencies()) == len(tok.get_vocabulary())
    if with_frequencies:
        assert hasattr(tok, "frequencies_")
        assert len(tok.get_frequencies()) == len(tok.get_vocabulary())
    with tempfile.TemporaryDirectory() as tmp_dir:
        voc_file = os.path.join(tmp_dir, "voc_file.csv")
        tok.to_vocabulary_file(voc_file)
        loaded = tokenization.tokenizing_pipeline_from_vocabulary_file(
            voc_file, voc_mapping=voc_mapping
        )
        assert (
            loaded.vocabulary_mapping_.voc_mapping
            == tok.vocabulary_mapping_.voc_mapping
        )
        assert loaded.get_full_vocabulary() == tok.get_full_vocabulary()
        assert loaded.get_vocabulary() == tok.get_vocabulary()
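
Putting the pieces together, here is a minimal end-to-end sketch of writing a two-column vocabulary CSV and loading a tokenizing pipeline from it, as the test above exercises through to_vocabulary_file and tokenizing_pipeline_from_vocabulary_file. Again this assumes the neuroquery package; the file name, vocabulary, and expected output are illustrative:

import os
import tempfile

import pandas as pd
from neuroquery import tokenization

with tempfile.TemporaryDirectory() as tmp_dir:
    voc_file = os.path.join(tmp_dir, "voc.csv")
    # Two columns, no header: term, frequency.
    pd.DataFrame({
        "term": ["working memory", "group", "groups"],
        "freq": [1.0, 0.5, 0.4],
    }).to_csv(voc_file, header=None, index=False)

    tok = tokenization.tokenizing_pipeline_from_vocabulary_file(
        voc_file, voc_mapping="auto")
    # Under "auto", the rarer "groups" collapses onto "group"; stop words
    # and out-of-vocabulary words such as "the" are dropped.
    print(tok("the working memory group groups"))
    # expected: ['working memory', 'group', 'group']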