Example #1
def test_GreedyAtomizer_TokenizeString_1():
    test_vocab = {"abc": 1, "a": 2, "b": 3, "ab": 4, "c": 5, "cab": 6, " ": 7}
    test_in = "abcababbaabcabcaabccccabcabccabcccabcabc"
    test_out = [
        "abc",
        "ab",
        "ab",
        "b",
        "a",
        "abc",
        "abc",
        "a",
        "abc",
        "c",
        "c",
        "cab",
        "cab",
        "c",
        "cab",
        "c",
        "c",
        "cab",
        "cab",
        "c",
    ]
    c = atomizers.GreedyAtomizer(test_vocab)
    assert c.TokenizeString(test_in) == test_out
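
For reference, the expected token sequence above can be reproduced by a plain left-to-right longest-match loop. The sketch below only illustrates the greedy matching idea; it is not the actual atomizers.GreedyAtomizer implementation from CLgen, and the helper name greedy_tokenize is invented for this example.

def greedy_tokenize(text, vocab):
    """Illustrative longest-match tokenizer, not the CLgen implementation."""
    max_len = max(len(token) for token in vocab)
    tokens = []
    i = 0
    while i < len(text):
        # Try the longest possible candidate first, then shrink until a
        # vocabulary entry matches the text at the current position.
        for length in range(min(max_len, len(text) - i), 0, -1):
            candidate = text[i:i + length]
            if candidate in vocab:
                tokens.append(candidate)
                i += length
                break
        else:
            raise ValueError("no vocabulary entry matches position %d" % i)
    return tokens

# Reproduces the expected output of the test above.
assert greedy_tokenize(
    "abcababbaabcabcaabccccabcabccabcccabcabc",
    {"abc", "a", "b", "ab", "c", "cab", " "},
) == [
    "abc", "ab", "ab", "b", "a", "abc", "abc", "a", "abc", "c", "c",
    "cab", "cab", "c", "cab", "c", "c", "cab", "cab", "c",
]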
Example #2
def GreedyAtomizerFromEncodedDb(encoded_db: encoded.EncodedContentFiles):
    """Create a greedy atomizer for the vocabulary of a given encoded_db."""
    # TODO(github.com/ChrisCummins/clgen/issues/130): This should be a method of
    # a concrete `DatabaseCorpus` class.
    with encoded_db.Session() as s:
        vocab = GetVocabFromMetaTable(s)
    app.Log(1, "Loaded vocabulary of %s tokens from meta table", len(vocab))
    return atomizers.GreedyAtomizer(vocab)
Example #3
def GreedyAtomizerFromEncodedDb(encoded_db: encoded.EncodedContentFiles):
  """Create a greedy atomizer for the vocabulary of a given encoded_db."""
  # TODO: This depends on the embedded "meta" table vocabulary from:
  # //experimental/deeplearning/deepsmith/java_fuzz/encode_java_corpus.py
  with encoded_db.Session() as s:
    vocab = GetVocabFromMetaTable(s)
  app.Log(1, 'Loaded vocabulary of %s tokens from meta table', len(vocab))
  return atomizers.GreedyAtomizer(vocab)
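
A minimal usage sketch for the function above, assuming the encoded corpus database is opened by URL like other CLgen sqlutil databases; the module import path and the database path below are assumptions made for illustration.

from deeplearning.clgen.corpuses import encoded  # assumed module path

# Hypothetical location of a previously encoded corpus database.
encoded_db = encoded.EncodedContentFiles("sqlite:////tmp/corpus/encoded.db")
atomizer = GreedyAtomizerFromEncodedDb(encoded_db)
# The returned atomizer exposes the same TokenizeString() interface that the
# tests above exercise.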
Example #4
def test_GreedyAtomizer_TokenizeString_1():
    test_vocab = {'abc': 1, 'a': 2, 'b': 3, 'ab': 4, 'c': 5, 'cab': 6, ' ': 7}
    test_in = 'abcababbaabcabcaabccccabcabccabcccabcabc'
    test_out = [
        'abc', 'ab', 'ab', 'b', 'a', 'abc', 'abc', 'a', 'abc', 'c', 'c', 'cab',
        'cab', 'c', 'cab', 'c', 'c', 'cab', 'cab', 'c'
    ]
    c = atomizers.GreedyAtomizer(test_vocab)
    assert c.TokenizeString(test_in) == test_out
Example #5
def test_GreedyAtomizer_TokenizeString_2():
    test_vocab = {"volatile": 0, "voletile": 1, "vo": 2, " ": 3, "l": 4}
    test_in = "volatile voletile vol "
    test_out = ["volatile", " ", "voletile", " ", "vo", "l", " "]
    c = atomizers.GreedyAtomizer(test_vocab)
    assert c.TokenizeString(test_in) == test_out
Example #6
def test_GreedyAtomizer_TokenizeString_2():
    test_vocab = {'volatile': 0, 'voletile': 1, 'vo': 2, ' ': 3, 'l': 4}
    test_in = 'volatile voletile vol '
    test_out = ['volatile', ' ', 'voletile', ' ', 'vo', 'l', ' ']
    c = atomizers.GreedyAtomizer(test_vocab)
    assert c.TokenizeString(test_in) == test_out
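
In both tests the expected tokens are an exact partition of the input string: concatenating them reconstructs the input character for character. A quick standalone check of that invariant, independent of CLgen:

test_in = "volatile voletile vol "
test_out = ["volatile", " ", "voletile", " ", "vo", "l", " "]
# The token list is an exact partition of the input string.
assert "".join(test_out) == test_in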