import os
import tempfile

import pandas as pd

# Assumed import path for the module under test.
from neuroquery import tokenization


def test_tuples_and_strings():
    tuples = [("a", "b"), ("c",), ("de", "fg")]
    strings = ["a b", "c", "de fg"]
    # Conversions are idempotent: already-converted input passes through.
    assert tokenization.tuple_sequence_to_strings(tuples) == strings
    assert tokenization.tuple_sequence_to_strings(strings) == strings
    assert tokenization.string_sequence_to_tuples(strings) == tuples
    assert tokenization.string_sequence_to_tuples(tuples) == tuples
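
# A hedged follow-up: the asserts above imply a round-trip property, so a
# minimal sketch of it is spelled out here. It assumes only the behavior the
# fixtures demonstrate (joining/splitting on single spaces); the test name is
# new, not part of the original suite.
def test_tuples_strings_roundtrip():
    strings = ["a b", "c", "de fg"]
    tuples = tokenization.string_sequence_to_tuples(strings)
    assert tokenization.tuple_sequence_to_strings(tuples) == strings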

def test_make_voc_mapping():
    voc = [
        ("experiment",),
        ("experiments",),
        ("experience",),
        ("experimentss",),
    ]
    freq = [1.0, 0.5, 0.2, 0.01]
    voc_mapping = tokenization.make_vocabulary_mapping(voc, freq)
    assert voc_mapping == {
        ("experiments",): ("experiment",),
        ("experimentss",): ("experiment",),
    }
    with tempfile.TemporaryDirectory() as tmp_dir:
        df = pd.DataFrame(
            {"term": tokenization.tuple_sequence_to_strings(voc), "freq": freq}
        )
        voc_file = os.path.join(tmp_dir, "voc.csv")
        df.to_csv(voc_file, header=False, index=False)
        # voc_mapping = tokenization.load_voc_mapping(voc_file)
        # assert voc_mapping == {('experiments',): ('experiment',)}
        pipe = tokenization.tokenizing_pipeline_from_vocabulary_file(
            voc_file, voc_mapping="auto"
        )
        assert pipe.vocabulary_mapping_.voc_mapping == {
            ("experiments",): ("experiment",),
            ("experimentss",): ("experiment",),
        }
        pipe = tokenization.tokenizing_pipeline_from_vocabulary(
            voc, voc_mapping="auto", frequencies=freq
        )
        assert pipe.vocabulary_mapping_.voc_mapping == {
            ("experiments",): ("experiment",),
            ("experimentss",): ("experiment",),
        }
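
# Hedged usage sketch: per the asserts above, make_vocabulary_mapping
# collapses low-frequency spelling variants ("experiments", "experimentss")
# onto the most frequent form ("experiment") and leaves unrelated terms
# ("experience") alone. The plain dict lookup below is an assumed way to
# apply the mapping, not a documented library API; the test name is new.
def test_voc_mapping_normalizes_variants():
    voc = [
        ("experiment",),
        ("experiments",),
        ("experience",),
        ("experimentss",),
    ]
    freq = [1.0, 0.5, 0.2, 0.01]
    voc_mapping = tokenization.make_vocabulary_mapping(voc, freq)
    tokens = [("experiments",), ("experience",), ("experimentss",)]
    normalized = [voc_mapping.get(t, t) for t in tokens]
    assert normalized == [("experiment",), ("experience",), ("experiment",)]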