예제 #1
0
def _get_scorer_and_corpus_eos():
    """Build two CPU scorers and a one-utterance corpus for EOS tests.

    An ``MLMScorer`` is created from ``'bert-base-en-uncased'`` and an
    ``MLMScorerPT`` from ``'bert-base-uncased'``; both are constructed with
    ``eos=True`` and ``wwm=False``.

    Returns:
        A ``(scorer_mx, scorer_pt, corpus)`` tuple, where ``corpus`` holds
        the single utterance ``"I am Sam"`` under the key ``'utt'``.
    """
    devices = [mx.cpu()]
    # Both scorers share the same keyword configuration.
    scorer_options = {'eos': True, 'wwm': False}

    mx_model, mx_vocab, mx_tokenizer = get_pretrained(
        devices, 'bert-base-en-uncased')
    scorer_mx = MLMScorer(
        mx_model, mx_vocab, mx_tokenizer, devices, **scorer_options)

    pt_model, pt_vocab, pt_tokenizer = get_pretrained(
        devices, 'bert-base-uncased')
    scorer_pt = MLMScorerPT(
        pt_model, pt_vocab, pt_tokenizer, devices, **scorer_options)

    utterances = {'utt': {'ref': "I am Sam"}}
    return scorer_mx, scorer_pt, Corpus.from_dict(utterances)
예제 #2
0
def test_get_pretrained():
    """Smoke-test ``get_pretrained`` for four BERT variants on CPU.

    Depending on the variant, the test checks the model class and depth, one
    word-embedding weight, the vocabulary's handling of letter case, and the
    tokenizer output for a fixed sentence.

    NOTE(review): the parameter keys asserted below are numbered
    ``bertmodel0`` .. ``bertmodel3`` in load order, so the sections
    presumably have to stay in this order -- confirm before reordering.
    """
    english = "The man jumped up, put his basket on Philammon's head"
    lowercase_pieces = (
        'the', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on',
        'phil', '##am', '##mon', "'", 's', 'head',
    )
    cased_pieces = (
        'The', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on',
        'Phil', '##am', '##mon', "'", 's', 'head',
    )

    def first_weight(net, key):
        """Return element [0, 0] of the word-embedding parameter ``key``."""
        return net.word_embed[0].params[key]._data[0][0, 0].asscalar()

    # --- bert-base-en-uncased ---
    model, vocab, tokenizer = get_pretrained(
        [mx.cpu()], 'bert-base-en-uncased')
    # Model
    assert isinstance(model, nlp.model.BERTModel)
    assert len(model.encoder.transformer_cells) == 12
    weight = first_weight(model, 'bertmodel0_word_embed_embedding0_weight')
    assert pytest.approx(weight) == -0.0424806065
    # Vocabulary: only the lowercase form is known.
    unk_idx = vocab.token_to_idx[vocab.unknown_token]
    assert vocab.token_to_idx['test'] != unk_idx
    assert vocab.token_to_idx['Test'] == unk_idx
    # Tokenizer
    assert tuple(tokenizer(english)) == lowercase_pieces

    # --- bert-base-en-uncased-owt ---
    model, vocab_new, tokenizer = get_pretrained(
        [mx.cpu()], 'bert-base-en-uncased-owt')
    # Model
    weight = first_weight(model, 'bertmodel1_word_embed_embedding0_weight')
    assert pytest.approx(weight) == -0.0361938476
    # Vocabulary: same size as the previously loaded uncased vocabulary.
    assert len(vocab_new) == len(vocab)
    # Tokenizer
    assert tuple(tokenizer(english)) == lowercase_pieces

    # --- bert-large-en-cased ---
    model, vocab, tokenizer = get_pretrained(
        [mx.cpu()], 'bert-large-en-cased')
    # Model
    assert isinstance(model, nlp.model.BERTModel)
    assert len(model.encoder.transformer_cells) == 24
    weight = first_weight(model, 'bertmodel2_word_embed_embedding0_weight')
    assert pytest.approx(weight) == 0.0116166482
    # Vocabulary: both casings are known and map to different indices.
    unk_idx = vocab.token_to_idx[vocab.unknown_token]
    assert vocab.token_to_idx['test'] != unk_idx
    assert vocab.token_to_idx['Test'] != unk_idx
    assert vocab.token_to_idx['Test'] != vocab.token_to_idx['test']
    # Tokenizer
    assert tuple(tokenizer(english)) == cased_pieces

    # --- bert-base-multi-cased ---
    model, vocab, tokenizer = get_pretrained(
        [mx.cpu()], 'bert-base-multi-cased')
    # Model
    assert isinstance(model, nlp.model.BERTModel)
    assert len(model.encoder.transformer_cells) == 12
    weight = first_weight(model, 'bertmodel3_word_embed_embedding0_weight')
    assert pytest.approx(weight) == 0.0518957935
    # Vocabulary
    unk_idx = vocab.token_to_idx[vocab.unknown_token]
    assert vocab.token_to_idx['Test'] != unk_idx
    assert vocab.token_to_idx['これは'] != unk_idx
    # Tokenizer
    japanese_pieces = ('これは', 'Test', 'で', '##す', '##よ', '。')
    assert tuple(tokenizer("これは Test ですよ。")) == japanese_pieces
예제 #3
0
def test_mlmscorer_score_sentences():
    """Compare ``score_sentences`` output with stored reference values.

    Each case names a pretrained model, the scorer class to wrap it in, and
    the expected per-token scores for ``"Hello world!"``. Per-token scores
    are compared with an absolute tolerance of 1e-4; positions where both
    the actual and the expected score are ``None`` are skipped. The sum of
    the compared expected scores must also match the sentence-level score.
    """
    text = "Hello world!"
    tolerance = 0.0001

    cases = (
        # README examples
        ('bert-base-en-cased', MLMScorer,
         (None, -6.126666069030762, -5.50140380859375, -0.7823182344436646,
          None)),
        ('bert-base-cased', MLMScorerPT,
         (None, -6.126738548278809, -5.501765727996826, -0.782496988773346,
          None)),
        ('gpt2-117m-en-cased', LMScorer,
         (-8.293947219848633, -6.387561798095703, -1.3138668537139893)),
        # etc.
        ('albert-base-v2', MLMScorerPT,
         (None, -16.480087280273438, -12.897505760192871, -4.277405738830566,
          None)),
        ('distilbert-base-cased', MLMScorerPT,
         (None, -5.1874895095825195, -6.390861511230469, -3.8225560188293457,
          None)),
    )

    for model_name, scorer_type, reference in cases:
        model, vocab, tokenizer = get_pretrained([mx.cpu()], model_name)
        scorer = scorer_type(model, vocab, tokenizer, [mx.cpu()])

        # Token-level comparison; accumulate the references that were checked.
        per_token = scorer.score_sentences([text], per_token=True)[0]
        reference_sum = 0
        for actual, wanted in zip(per_token, reference):
            if not (actual is None and wanted is None):
                assert pytest.approx(actual, abs=tolerance) == wanted
                reference_sum += wanted

        # Sentence-level comparison against the accumulated references.
        sentence_score = scorer.score_sentences([text], per_token=False)[0]
        assert pytest.approx(sentence_score, abs=tolerance) == reference_sum
예제 #4
0
class Server(BaseHTTPRequestHandler):
    """HTTP request handler that scores sentences with an ``MLMScorer``.

    ``GET`` and ``HEAD`` answer with a small HTML page (headers only for
    ``HEAD``). ``POST /score`` expects a JSON object with a ``texts`` list
    (and optionally an ``id``) and replies with a JSON object holding the
    request id, the value returned by ``scorer.score`` and a status field.
    """

    # Created once when the class body is executed and shared by all requests.
    ctxs = [mx.gpu()]
    model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-cased')
    scorer = MLMScorer(model, vocab, tokenizer, ctxs)

    def _set_headers(self, content_type):
        """Send a 200 status line and a ``Content-type`` header."""
        self.send_response(200)
        self.send_header('Content-type', content_type)
        self.end_headers()

    def _reject(self, status=400):
        """Send a bare error status with no body."""
        self.send_response(status)
        self.end_headers()

    @staticmethod
    def _html(message):
        """Generate an HTML document that includes `message` in the body.

        Override or rewrite this to do more interesting stuff.
        """
        content = f"<html><body><h1>{message}</h1></body></html>"
        return content.encode('utf8')  # NOTE: must return a bytes object!

    def do_GET(self):
        """Answer any GET request with a fixed HTML greeting."""
        self._set_headers('text/html')
        self.wfile.write(self._html('hi'))

    def do_HEAD(self):
        """Answer any HEAD request with the HTML headers only."""
        self._set_headers('text/html')

    def do_POST(self):
        """Score the sentences posted as JSON to ``/score``.

        Responds with 400 when the content type is not ``application/json``,
        the path is not ``/score``, or the body cannot be parsed into an
        object containing ``texts``.
        """
        print('received request')
        # Default to '' so a missing Content-Type header is rejected below
        # instead of crashing parse_header with None.
        ctype, _ = cgi.parse_header(self.headers.get('content-type', ''))

        # refuse to receive non-json content
        if ctype != 'application/json' or self.path != '/score':
            self._reject()
            return

        # read the message and convert it into a python dictionary
        try:
            length = int(self.headers.get('content-length', 0))
            request = json.loads(self.rfile.read(length))
            sentences = request['texts']
        except (ValueError, KeyError, TypeError):
            # Bad Content-Length, malformed JSON, or no 'texts' entry.
            self._reject()
            return

        # json.loads already yields a plain list, so it is passed on as is.
        corpus = Corpus.from_text(sentences)

        # Score the corpus with the shared scorer.
        print(f'scoring {len(sentences)} sentences')
        scores = self.scorer.score(corpus)
        print('done')
        response = {'id': request.get('id'), 'result': scores, 'status': 200}
        # The body is JSON, so advertise it as such.
        self._set_headers('application/json')
        self.wfile.write(json.dumps(response).encode('utf8'))
예제 #5
0
 def __init__(self, model_name_or_path, gpu_batch_size=1, gpu_id=0):
     """Create an ``MLMScorerPT`` on one GPU and store the batch size.

     Args:
         model_name_or_path: Passed to ``get_pretrained`` to select the model.
         gpu_batch_size: Stored unchanged as ``self.gpu_batch_size``.
         gpu_id: Index handed to ``mx.gpu`` to pick the device.
     """
     devices = [mx.gpu(gpu_id)]
     # Everything get_pretrained returns is forwarded to the scorer,
     # followed by the device list.
     pretrained = get_pretrained(devices, model_name_or_path)
     self.scorer = MLMScorerPT(*pretrained, devices)
     self.gpu_batch_size = gpu_batch_size
예제 #6
0
class Server(BaseHTTPRequestHandler):
    """HTTP request handler exposing sentence embedding and scoring.

    ``POST /encode`` returns sentence embeddings from ``sbert_model``;
    ``POST /score`` returns scores from ``scorer``. Both expect a JSON object
    with a ``texts`` entry and an ``id``, and reply with JSON.
    """

    # sentence_transformers
    sbert_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

    # gpt with pytorch_pretrained_bert
    # torch.cuda.set_device(0)
    # model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    # model.eval()
    # tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    # mlms scorers
    ctxs = [mx.gpu()]
    # mlms_model, vocab, tokenizer = get_pretrained(ctxs, 'roberta-base-en-cased')
    # scorer = MLMScorer(mlms_model, vocab, tokenizer, ctxs)
    # mlms_model, vocab, tokenizer = get_pretrained(ctxs, 'distilbert-base-cased')
    # scorer = MLMScorerPT(mlms_model, vocab, tokenizer, ctxs)
    mlms_model, vocab, tokenizer = get_pretrained(ctxs, 'gpt2-117m-en-cased')
    scorer = LMScorer(mlms_model, vocab, tokenizer, ctxs)

    def _set_headers(self, content_type):
        """Send a 200 status line and a ``Content-type`` header."""
        self.send_response(200)
        self.send_header('Content-type', content_type)
        self.end_headers()

    def _reject(self, status):
        """Send a bare error status with no body."""
        self.send_response(status)
        self.end_headers()

    def _send_json(self, payload):
        """Send ``payload`` as a 200 response with a JSON body."""
        self._set_headers('application/json')
        self.wfile.write(json.dumps(payload).encode('utf8'))

    @staticmethod
    def _html(message):
        """Generate an HTML document that includes `message` in the body.

        Override or rewrite this to do more interesting stuff.
        """
        content = f"<html><body><h1>{message}</h1></body></html>"
        return content.encode('utf8')  # NOTE: must return a bytes object!

    def do_GET(self):
        """Answer any GET request with a fixed HTML greeting."""
        self._set_headers('text/html')
        self.wfile.write(self._html('hi'))

    def do_HEAD(self):
        """Answer any HEAD request with the HTML headers only."""
        self._set_headers('text/html')

    def do_POST(self):
        """Dispatch a JSON POST to ``/encode`` or ``/score``.

        Responds with 400 for non-JSON content or an unparsable body, and
        with 404 for any other path so the client always gets an answer.
        """
        print('received request')
        # Default to '' so a missing Content-Type header is rejected below
        # instead of crashing parse_header with None.
        ctype, _ = cgi.parse_header(self.headers.get('content-type', ''))

        # refuse to receive non-json content
        if ctype != 'application/json':
            self._reject(400)
            return

        # read the message and convert it into a python dictionary
        try:
            length = int(self.headers.get('content-length', 0))
            request = json.loads(self.rfile.read(length))
            sentences = request['texts']
        except (ValueError, KeyError, TypeError):
            # Bad Content-Length, malformed JSON, or no 'texts' entry.
            self._reject(400)
            return

        if self.path == '/encode':
            # Encode sentences using the sentence-embedding model
            print(f'encoding {len(sentences)} sentences')
            embeddings = self.sbert_model.encode(sentences)
            print('done')
            self._send_json({
                'id': request['id'],
                'result': embeddings.tolist(),
                'status': 200
            })
        elif self.path == '/score':
            print(f'scoring {len(sentences)} sentences')
            scores = self.model_score(sentences)
            print('done')
            self._send_json({
                'id': request['id'],
                'result': [scores[0]],
                'status': 200
            })
        else:
            # Unknown endpoint: previously no response was sent at all.
            self._reject(404)

    # uses mlms scorer
    def model_score(self, sentences):
        """Score ``sentences`` with the class-level scorer.

        The sentences are wrapped in a ``Corpus`` and passed to
        ``scorer.score`` together with the positional arguments ``1.0`` and
        ``50``; the result is returned unchanged.
        """
        corpus = Corpus.from_text(sentences)
        return self.scorer.score(corpus, 1.0, 50)
예제 #7
0
from mlm.scorers import MLMScorer, MLMScorerPT, LMScorer
from mlm.models import get_pretrained
import mxnet as mx
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np

# Devices passed to get_pretrained() and to the scorer below.
ctxs = [mx.cpu()]  # or, e.g., [mx.gpu(0), mx.gpu(1)]

# Sample input scored by the checks below.
sentence = 'confirms HTTPURL via @USER :cry:'

print('Checking original MLM library..')
# MXNet MLMs (use names from mlm.models.SUPPORTED_MLMS)
model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-cased')

#print(type(vocab).__name__)
scorer = MLMScorer(model, vocab, tokenizer, ctxs)
# NOTE(review): the sample outputs quoted after the two prints below show
# only three scored positions, which does not fit a five-word sentence; they
# look copied from a run on a different input -- confirm before relying on
# them.
print(scorer.score_sentences([sentence]))
# >> [-12.410664200782776]
print(scorer.score_sentences([sentence], per_token=True))
# >> [[None, -6.126736640930176, -5.501412391662598, -0.7825151681900024, None]]

print('Done. Checking extension..')
# Load the AutoTokenizer with a normalization mode if the input Tweet is raw
# NOTE(review): `tokenizer` is rebound by the get_pretrained() call right
# after this statement, before it is used anywhere in the visible code.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base",
                                          normalization=True)

# Rebinds `vocab` and `tokenizer`; the model is kept under `bertweet`.
bertweet, vocab, tokenizer = get_pretrained(ctxs,
                                            'vinai/bertweet-base-en-cased')

#print(BERTVocab(tokenizer.vocab_file))
예제 #8
0
def test_get_pretrained():
    """Smoke-test ``get_pretrained`` for MXNet and PyTorch BERT variants.

    Depending on the variant, the test checks the model class and depth, one
    word-embedding weight, the vocabulary's handling of letter case, and the
    tokenizer output for a fixed sentence.

    NOTE(review): the MXNet parameter keys asserted below are numbered
    ``bertmodel0`` .. ``bertmodel3`` in load order, so the sections
    presumably have to stay in this order -- confirm before reordering.
    """
    english = "The man jumped up, put his basket on Philammon's head"
    lowercase_pieces = (
        'the', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on',
        'phil', '##am', '##mon', "'", 's', 'head',
    )
    cased_pieces = (
        'The', 'man', 'jumped', 'up', ',', 'put', 'his', 'basket', 'on',
        'Phil', '##am', '##mon', "'", 's', 'head',
    )

    def mx_weight(net, key, row):
        """Return element [row, 0] of the word-embedding parameter ``key``."""
        return net.word_embed[0].params[key]._data[0][row, 0].asscalar()

    # --- MXNet: bert-base-en-uncased ---
    model, uncased_vocab, tokenizer = get_pretrained(
        [mx.cpu()], 'bert-base-en-uncased')
    # Model
    assert isinstance(model, nlp.model.BERTModel)
    assert len(model.encoder.transformer_cells) == 12
    unk_idx = uncased_vocab.token_to_idx[uncased_vocab.unknown_token]
    value = mx_weight(
        model, 'bertmodel0_word_embed_embedding0_weight', unk_idx)
    assert pytest.approx(value) == -0.0424806065
    # Vocabulary: only the lowercase form is known.
    assert uncased_vocab.token_to_idx['test'] != unk_idx
    assert uncased_vocab.token_to_idx['Test'] == unk_idx
    # Tokenizer
    assert tuple(tokenizer(english)) == lowercase_pieces

    # --- PyTorch: bert-base-en-uncased ---
    model, _, tokenizer = get_pretrained([mx.cpu()], 'bert-base-uncased')
    # Model
    assert isinstance(model, BertForMaskedLMOptimized)
    assert len(model.bert.encoder.layer) == 12
    unk_idx = tokenizer.unk_token_id
    # The first parameter yielded by the embedding module is indexed here.
    embedding = next(model.bert.embeddings.word_embeddings.parameters())
    value = embedding[unk_idx, 0].detach().numpy().item()
    assert pytest.approx(value) == -0.0424806065
    # Vocabulary
    assert tokenizer.convert_tokens_to_ids('test') != unk_idx
    assert tokenizer.convert_tokens_to_ids('Test') == unk_idx
    # Tokenizer
    assert tuple(tokenizer.tokenize(english)) == lowercase_pieces

    # --- MXNet: bert-base-en-uncased-owt ---
    model, owt_vocab, tokenizer = get_pretrained(
        [mx.cpu()], 'bert-base-en-uncased-owt')
    # Model
    value = mx_weight(model, 'bertmodel1_word_embed_embedding0_weight', 0)
    assert pytest.approx(value) == -0.0361938476
    # Vocabulary: same size as the MXNet uncased vocabulary loaded first.
    assert len(owt_vocab) == len(uncased_vocab)
    # Tokenizer
    assert tuple(tokenizer(english)) == lowercase_pieces

    # --- MXNet: bert-large-en-cased ---
    model, cased_vocab, tokenizer = get_pretrained(
        [mx.cpu()], 'bert-large-en-cased')
    # Model
    assert isinstance(model, nlp.model.BERTModel)
    assert len(model.encoder.transformer_cells) == 24
    value = mx_weight(model, 'bertmodel2_word_embed_embedding0_weight', 0)
    assert pytest.approx(value) == 0.0116166482
    # Vocabulary: both casings are known and map to different indices.
    unk_idx = cased_vocab.token_to_idx[cased_vocab.unknown_token]
    assert cased_vocab.token_to_idx['test'] != unk_idx
    assert cased_vocab.token_to_idx['Test'] != unk_idx
    assert (cased_vocab.token_to_idx['Test']
            != cased_vocab.token_to_idx['test'])
    # Tokenizer
    assert tuple(tokenizer(english)) == cased_pieces

    # --- MXNet: bert-base-multi-cased ---
    model, multi_vocab, tokenizer = get_pretrained(
        [mx.cpu()], 'bert-base-multi-cased')
    # Model
    assert isinstance(model, nlp.model.BERTModel)
    assert len(model.encoder.transformer_cells) == 12
    value = mx_weight(model, 'bertmodel3_word_embed_embedding0_weight', 0)
    assert pytest.approx(value) == 0.0518957935
    # Vocabulary
    unk_idx = multi_vocab.token_to_idx[multi_vocab.unknown_token]
    assert multi_vocab.token_to_idx['Test'] != unk_idx
    assert multi_vocab.token_to_idx['これは'] != unk_idx
    # Tokenizer
    japanese_pieces = ('これは', '##T', '##est', '##で', '##す', '##よ', '。')
    assert tuple(tokenizer("これはTestですよ。")) == japanese_pieces