import os
import tempfile

import pandas as pd
import pytest

# Imports assumed for this excerpt (they were not included in it);
# "tokenization" refers to neuroquery's tokenization module.
from neuroquery import tokenization


def test_make_voc_mapping():
    """Variants of a term map to its most frequent form ("experiments" -> "experiment")."""
    voc = [
        ("experiment",),
        ("experiments",),
        ("experience",),
        ("experimentss",),
    ]
    freq = [1.0, 0.5, 0.2, 0.01]
    voc_mapping = tokenization.make_vocabulary_mapping(voc, freq)
    assert voc_mapping == {
        ("experiments",): ("experiment",),
        ("experimentss",): ("experiment",),
    }
    with tempfile.TemporaryDirectory() as tmp_dir:
        df = pd.DataFrame(
            {"term": tokenization.tuple_sequence_to_strings(voc), "freq": freq}
        )
        voc_file = os.path.join(tmp_dir, "voc.csv")
        df.to_csv(voc_file, header=None, index=False)
        # voc_mapping = tokenization.load_voc_mapping(voc_file)
        # assert voc_mapping == {('experiments',): ('experiment',)}
        pipe = tokenization.tokenizing_pipeline_from_vocabulary_file(
            voc_file, voc_mapping="auto"
        )
        assert pipe.vocabulary_mapping_.voc_mapping == {
            ("experiments",): ("experiment",),
            ("experimentss",): ("experiment",),
        }
        pipe = tokenization.tokenizing_pipeline_from_vocabulary(
            voc, voc_mapping="auto", frequencies=freq
        )
        assert pipe.vocabulary_mapping_.voc_mapping == {
            ("experiments",): ("experiment",),
            ("experimentss",): ("experiment",),
        }
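

# Sketch added for illustration (not part of the original suite): the tests
# here rely on tuple_sequence_to_strings and string_sequence_to_tuples being
# inverses of each other, so that round trip is spelled out on a tiny
# vocabulary with one multi-word term.
def test_tuple_string_round_trip():
    voc = [("working", "memory"), ("group",)]
    as_strings = tokenization.tuple_sequence_to_strings(voc)
    # Multi-word terms become single space-joined strings and back.
    assert tokenization.string_sequence_to_tuples(as_strings) == voc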


# The parameter grid below is inferred from the branches in the test body
# (voc_mapping is compared to {} and "auto"; with_frequencies toggles the
# frequency checks); the decorator itself was not part of this excerpt.
@pytest.mark.parametrize(
    ["voc_mapping", "with_frequencies"],
    [("auto", True), ("auto", False), ({}, True), ({}, False)],
)
def test_tokenizing_pipeline(voc_mapping, with_frequencies):
    """Pipelines built from a vocabulary file tokenize text, expose their
    vocabulary, and survive a round trip through to_vocabulary_file."""
    tok = tokenization.tokenizing_pipeline_from_vocabulary_file(
        VOCABULARY_FILE, voc_mapping=voc_mapping
    )
    if not with_frequencies:
        tok.frequencies = None
    if voc_mapping == {}:
        # Without a vocabulary mapping, "groups" stays a distinct term.
        assert tok("the working memory group xyzzzz groups") == [
            "working memory",
            "group",
            "groups",
        ]
    else:
        # With the automatic mapping, "groups" is merged into "group".
        assert tok("the working memory group xyzzzz groups") == [
            "working memory",
            "group",
            "group",
        ]
    assert tok.get_full_vocabulary(
        as_tuples=True
    ) == tokenization.string_sequence_to_tuples(tok.get_full_vocabulary())
    assert tok.get_vocabulary(
        as_tuples=True
    ) == tokenization.string_sequence_to_tuples(tok.get_vocabulary())
    if voc_mapping == "auto":
        # The automatic mapping merges two terms out of the full vocabulary.
        assert len(tok.get_full_vocabulary()) == len(tok.get_vocabulary()) + 2
    else:
        assert len(tok.get_full_vocabulary()) == len(tok.get_vocabulary())
    assert len(tok.get_frequencies()) == len(tok.get_vocabulary())
    if with_frequencies:
        assert hasattr(tok, "frequencies_")
        assert len(tok.get_frequencies()) == len(tok.get_vocabulary())
    with tempfile.TemporaryDirectory() as tmp_dir:
        voc_file = os.path.join(tmp_dir, "voc_file.csv")
        tok.to_vocabulary_file(voc_file)
        loaded = tokenization.tokenizing_pipeline_from_vocabulary_file(
            voc_file, voc_mapping=voc_mapping
        )
        assert (
            loaded.vocabulary_mapping_.voc_mapping
            == tok.vocabulary_mapping_.voc_mapping
        )
        assert loaded.get_full_vocabulary() == tok.get_full_vocabulary()
        assert loaded.get_vocabulary() == tok.get_vocabulary()
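

# Sketch added for illustration (not part of the original suite): with
# voc_mapping="auto", a rare variant is merged into its frequent form at
# tokenization time, mirroring what the tests above assert for
# "groups" -> "group" and "experiments" -> "experiment".
def test_auto_mapping_merges_variants_in_memory():
    voc = [("experiment",), ("experiments",)]
    freq = [1.0, 0.5]
    pipe = tokenization.tokenizing_pipeline_from_vocabulary(
        voc, voc_mapping="auto", frequencies=freq
    )
    assert pipe.vocabulary_mapping_.voc_mapping == {
        ("experiments",): ("experiment",)
    }
    # Both occurrences tokenize to the frequent form.
    assert pipe("experiment experiments") == ["experiment", "experiment"]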