Example #1
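These snippets exercise a byte-pair-encoding Encoder and rely on shared fixtures that the excerpts don't show. A minimal sketch of what the tests assume follows; the import path, marker values, and corpus contents are assumptions, not the project's actual fixtures:

# Hypothetical shared fixtures for the tests in this section; names and values are assumed.
from bpe import Encoder

SOW = '__sow'  # start-of-word marker
EOW = '__eow'  # end-of-word marker
UNK = '__unk'  # unknown-token placeholder
PAD = '__pad'  # padding token

# Tiny stand-in corpus; the real test_corpus is presumably larger.
test_corpus = [
    'from toolz import reduce',
    'import this',
    'this is how we do it',
]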
def test_unseen_word_ending():
    """ The last character should come with a </w> even if it wasn't seen as the last letter of a 
        word in the training set.
    """
    encoder = Encoder(pct_bpe=1, ngram_max=4)
    encoder.fit(test_corpus)
    assert encoder.tokenize('import toolz') == [SOW, 'impo', 'rt', EOW, SOW, 'tool', 'z', EOW]
def test_mixed_encoder():
    encoder = Encoder(vocab_size=1000, pct_bpe=0.98, ngram_max=4)
    encoder.fit(test_corpus)
    assert encoder.tokenize('import this yield toolz') == [
        'import', SOW, 'th', 'is', EOW, SOW, 'yiel', 'd', EOW, SOW, 'tool',
        'z', EOW
    ]
def test_bpe_encoder_fit():
    """ Encoer should be able to fit to provided text data. """
    encoder = Encoder(pct_bpe=1, ngram_max=4)
    encoder.fit(test_corpus)
    assert encoder.tokenize('from toolz import reduce') == [
        SOW, 'f', 'ro', 'm', EOW, SOW, 'tool', 'z', EOW, SOW, 'impo', 'rt',
        EOW, SOW, 'redu', 'ce', EOW
    ]
Example #4
def main(corpus_path):
    # type: (str) -> None
    """ Loads corpus, learns word and BPE vocab, and writes to stdout.  Assumes corpus is
        line-separated text.
    """
    with open(corpus_path) as infile:
        lines = list(map(str.strip, infile))

    encoder = Encoder(silent=True)
    encoder.fit(lines)
    print(json.dumps(encoder.vocabs_to_dict()))
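A minimal sketch of wiring this entry point to the command line; the argparse setup below is an assumption, and the original script may parse arguments differently:

if __name__ == '__main__':
    import argparse

    # Hypothetical CLI wrapper around main(); argument handling is assumed.
    parser = argparse.ArgumentParser(description='Learn word and BPE vocabs from a corpus.')
    parser.add_argument('corpus_path', help='path to a line-separated text corpus')
    main(parser.parse_args().corpus_path)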
def test_encoder_creation_graceful_failure(vocab_size):
    """ Min vocab size is 1.  Anything lower should ValueError """
    died = False

    try:
        Encoder(vocab_size=vocab_size)
    except ValueError:
        died = True

    assert died, "Encoder should have raised a ValueError for < 1 vocab size"
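The same guard reads more compactly with pytest.raises; a sketch assuming the suite runs under pytest (the alternative test name and parameter values are illustrative):

import pytest

@pytest.mark.parametrize('vocab_size', [0, -1])
def test_encoder_creation_graceful_failure_alt(vocab_size):
    """ Equivalent check expressed with pytest.raises. """
    with pytest.raises(ValueError):
        Encoder(vocab_size=vocab_size)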
def test_strict_mode():
    strict_encoder = Encoder(pct_bpe=1, strict=True)
    strict_encoder.fit(test_corpus)
    failed = False
    idxs = [[9]]
    try:
        list(strict_encoder.inverse_transform(idxs))
    except ValueError:
        failed = True
    assert failed, 'Should have failed to inverse transform word due to strict mode'

    non_strict_encoder = Encoder(pct_bpe=1, strict=False)
    non_strict_encoder.fit(test_corpus)
    failed = False
    idxs = [[9]]
    try:
        list(non_strict_encoder.inverse_transform(idxs))
    except ValueError:
        failed = True
    assert not failed, 'Should not have failed to inverse transform word due to non-strict mode'
def test_unknown_char_handling():
    encoder = Encoder(pct_bpe=1)
    encoder.fit(test_corpus)

    result = list(encoder.inverse_transform(encoder.transform([';'])))[0]
    assert encoder.UNK in result
    assert ';' not in result
Example #8
    def load(self, model_name, force_update=False):
        """
        Use this function to automatically download and load a new model. It will automatically check online if a newer version is available.
        :param model_name: identifier for the model to be loaded
        :param force_update: use this flag to trigger forceful model update - useful if you have an unhealthy model store
        :return: True if the process was successful and False if something failed

        """
        try:
            URL_PREFIX = "https://github.com/adobe/tripod/raw/master/data/trained/"
            model_prefix = os.path.join(self._model_store, model_name)
            must_download = force_update or not os.path.exists(
                model_prefix + '.best') or not os.path.exists(model_prefix +
                                                              '.encodings')
            model_name_suffixes = ['-aa', '-ab', '-ac', '-ad']
            if must_download:
                # download file parts
                for model_name_suffix in model_name_suffixes:
                    url = "{0}{1}.zip{2}".format(URL_PREFIX, model_name,
                                                 model_name_suffix)
                    print(url)
                    download_target = model_prefix + '.zip' + model_name_suffix
                    self._download_with_progress_bar(url, download_target)
                    sys.stdout.write('\n')

                # concatenate the downloaded parts into a single zip archive
                download_target = model_prefix + '.zip'
                with open(download_target, 'wb') as f_out:
                    for model_name_suffix in model_name_suffixes:
                        download_part = model_prefix + '.zip' + model_name_suffix
                        with open(download_part, 'rb') as f_in:
                            f_out.write(f_in.read())
                # extract the reassembled archive into the model store
                with ZipFile(download_target, "r") as zip_archive:
                    zip_archive.extractall(self._model_store)
                sys.stdout.write("\nModel extracted successfully.")
                sys.stdout.flush()
            if os.path.exists(model_prefix + '.bpe'):
                self._bpe = BPEEncoder.load(model_prefix + '.bpe')

            self._encodings = Encodings()
            self._encodings.load(model_prefix + '.encodings')
            self._model = TripodModel2(self._encodings)
            self._model.load(model_prefix + '.best')
            self._model.to(self._device)
            self._model.eval()
            self._loaded = True
            return True
        except Exception:
            # Report failure via the return value; a bare `except:` would also
            # swallow KeyboardInterrupt and SystemExit.
            return False
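A hypothetical call site for this loader; the owning class's name and constructor arguments below are stand-ins, not tripod's actual API:

# Hypothetical usage; the class name and constructor are assumed.
store = TripodModelStore(model_store='data/models', device='cpu')
if not store.load('tripod-base', force_update=False):
    raise RuntimeError('Failed to download or load the model.')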
def test_mixed_encoder_word_in_other_word():
    """ Ensure that a word is correctly decoded when it contains another word """
    encoder = Encoder(vocab_size=1000, pct_bpe=0.98, ngram_max=4)
    encoder.fit(test_corpus)
    text = 'imimportport this yield toolz'
    idxs = list(encoder.transform([text]))
    idxs[0][1] = encoder.word_vocab['import']
    rebuilt = next(encoder.inverse_transform(idxs))
    assert rebuilt == 'import' + text[1:]
def test_common_byte_pair_collisions():
    """ Ensure common byte pairs like "as" don't pull from word vocab when they are subword """
    encoder = Encoder(vocab_size=200, pct_bpe=0.9, ngram_max=2)
    encoder.fit(test_corpus + ["as"] * 10)
    word = next(encoder.transform(["8 miles as the crow flies."]))
    assert encoder.bpe_vocab["as"] not in word
    assert encoder.word_vocab["as"] in word
    subword = next(encoder.transform(["Basted turkey legs."]))
    assert encoder.word_vocab["as"] not in subword
    assert encoder.bpe_vocab["as"] in subword
def test_fixed_length_encoding():
    encoder = Encoder(pct_bpe=1, required_tokens=[PAD])
    encoder.fit(test_corpus)

    result = list(encoder.transform([''], fixed_length=10))
    assert len(result) == 1
    assert len(result[0]) == 10

    result = list(encoder.transform(['', 'import ' * 50], fixed_length=10))
    assert len(result) == 2
    assert len(result[0]) == 10
    assert len(result[1]) == 10
def test_dump_and_load():
    """ Should be able to dump encoder to dict, then load it again. """
    encoder = Encoder(pct_bpe=1, ngram_max=4)
    encoder.fit(test_corpus)
    assert encoder.tokenize('from toolz import reduce') == [
        SOW, 'f', 'ro', 'm', EOW, SOW, 'tool', 'z', EOW, SOW, 'impo', 'rt',
        EOW, SOW, 'redu', 'ce', EOW
    ]

    encoder_d = encoder.vocabs_to_dict()
    new_encoder = Encoder.from_dict(encoder_d)

    assert new_encoder.tokenize('from toolz import reduce') == [
        SOW, 'f', 'ro', 'm', EOW, SOW, 'tool', 'z', EOW, SOW, 'impo', 'rt',
        EOW, SOW, 'redu', 'ce', EOW
    ]
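The same round trip works through a file on disk, mirroring the json.dumps call in example #4; a minimal sketch:

import json

# Persist the encoder, then restore it from the serialized vocabs.
with open('encoder.json', 'w') as outfile:
    json.dump(encoder.vocabs_to_dict(), outfile)

with open('encoder.json') as infile:
    restored = Encoder.from_dict(json.load(infile))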
Example #13
def test_inverse_transform():
    encoder = Encoder(pct_bpe=1)
    encoder.fit(test_corpus)

    def roundtrip(text):
        return next(encoder.inverse_transform(encoder.transform([text])))

    assert roundtrip('this is how we do it') == 'this is how we do it'

    assert roundtrip('looking at the promotional stuff, it looks good.') == \
        'looking at the promotional stuff {} it looks good .'.format(UNK)

    assert roundtrip('almost nothing should be recognized! let\'s see...') == \
        'almost nothing should be recognized {unk} let {unk} s see ...'.format(unk=UNK)

    assert roundtrip("Vizzini: He didn't fall? INCONCEIVABLE!") == \
        "vizzini {unk} he didn {unk} t fall {unk} inconceivable {unk}".format(unk=UNK)
def test_subword_tokenize():
    encoder = Encoder(pct_bpe=1)
    encoder.fit(test_corpus)
    assert list(encoder.subword_tokenize('this')) == [SOW, 'th', 'is', EOW]
def test_required_tokens():
    """ Should be able to require tokens to be present in encoder """
    encoder = Encoder(required_tokens=['cats', 'dogs'])
    encoder.fit(test_corpus)
    assert 'cats' in encoder.word_vocab
    assert 'dogs' in encoder.word_vocab
def test_single_letter_tokenizing():
    """ Should yield single letters when untrained """
    encoder = Encoder()
    assert encoder.tokenize('single letters') == \
        [SOW] + [UNK] * len('single') + [EOW, SOW] + [UNK] * len('letters') + [EOW]
Example #17
def encoder_for_lines(lines):
    """ Calculate BPE encoder for provided lines of text """
    encoder = Encoder(vocab_size=VOCAB_SIZE, required_tokens=[START])
    encoder.fit(lines)
    encoder.save('latest_encoder.json')
    return encoder
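A hedged usage sketch; an Encoder.load counterpart to encoder.save() is an assumption inferred from the BPEEncoder.load call in example #8:

# Hypothetical round trip; Encoder.load mirroring save() is assumed.
lines = open('corpus.txt').read().splitlines()
encoder = encoder_for_lines(lines)
restored = Encoder.load('latest_encoder.json')
assert restored.tokenize('some text') == encoder.tokenize('some text')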
def test_encoder_creation(vocab_size):
    """ Should be able to instantiate an Encoder with expected params """
    Encoder(vocab_size=vocab_size)
def test_tokenize():
    encoder = Encoder(pct_bpe=1)
    encoder.fit(test_corpus)
    assert list(encoder.tokenize('this is how')) == [
        SOW, 'th', 'is', EOW, SOW, 'is', EOW, SOW, 'ho', 'w', EOW
    ]
def test_basic_transform():
    encoder = Encoder(pct_bpe=1)
    encoder.fit(test_corpus)
    assert len(list(encoder.transform(['this']))[0]) == 4
def test_encoder_learning_from_random_sentences(sentences):
    encoder = Encoder()
    encoder.fit(test_corpus)
    # transform() returns a generator; force it so any encoding errors surface.
    encoded = list(encoder.transform(sentences))