class ComputeEmbeddingsTest(unittest.TestCase):
    """Check shapes and checksums of ``SentenceTransformer.encode`` output.

    The expected sums are reference values for the
    ``paraphrase-distilroberta-base-v1`` model that ``setUp`` loads.
    """

    def setUp(self):
        """Load the model under test before each test method."""
        self.model = SentenceTransformer('paraphrase-distilroberta-base-v1')

    def test_encode_single_sentences(self):
        """Encode a bare string, a one-element list and a three-element list."""
        # Single sentence: a plain string is expected to give a 1-D vector.
        emb = self.model.encode("Hello Word, a test sentence")
        self.assertEqual(emb.shape, (768,))
        self.assertAlmostEqual(np.sum(emb), 7.9811716, delta=0.001)

        # Single sentence as list: same values, but with a leading batch axis.
        emb = self.model.encode(["Hello Word, a test sentence"])
        self.assertEqual(emb.shape, (1, 768))
        self.assertAlmostEqual(np.sum(emb), 7.9811716, delta=0.001)

        # Sentence list
        emb = self.model.encode([
            "Hello Word, a test sentence",
            "Here comes another sentence",
            "My final sentence",
        ])
        self.assertEqual(emb.shape, (3, 768))
        self.assertAlmostEqual(np.sum(emb), 22.968266, delta=0.001)

    def test_encode_tuple_sentences(self):
        """Encode (first, second) sentence tuples, alone and in a list."""
        # Input a sentence tuple
        emb = self.model.encode([
            ("Hello Word, a test sentence", "Second input for model"),
        ])
        self.assertEqual(emb.shape, (1, 768))
        self.assertAlmostEqual(np.sum(emb), 9.503508, delta=0.001)

        # List of sentence tuples
        emb = self.model.encode([
            ("Hello Word, a test sentence", "Second input for model"),
            ("My second tuple", "With two inputs"),
            ("Final tuple", "final test"),
        ])
        self.assertEqual(emb.shape, (3, 768))
        self.assertAlmostEqual(np.sum(emb), 32.14627, delta=0.001)

    def test_multi_gpu_encode(self):
        """Multi-process encoding must match plain ``encode`` output."""
        # Start the multi-process pool on two 'cpu' targets (not CUDA devices).
        pool = self.model.start_multi_process_pool(['cpu', 'cpu'])
        try:
            sentences = ["This is sentence {}".format(i) for i in range(1000)]

            # Compute the embeddings using the multi-process pool
            emb = self.model.encode_multi_process(sentences, pool, chunk_size=50)
            self.assertEqual(emb.shape, (len(sentences), 768))

            # Summed absolute difference against the single-process result.
            emb_normal = self.model.encode(sentences)
            diff = np.sum(np.abs(emb - emb_normal))
            self.assertLess(diff, 0.001)
        finally:
            # Always shut the workers down, even when an assertion fails,
            # so no worker processes outlive the test.
            self.model.stop_multi_process_pool(pool)
# Example no. 2
# 0
class ComputeMultiProcessTest(unittest.TestCase):
    """Compare multi-process encoding against single-process encoding."""

    def setUp(self):
        """Load the model under test before each test method."""
        self.model = SentenceTransformer('paraphrase-distilroberta-base-v1')

    def test_multi_gpu_encode(self):
        """Multi-process encoding must match plain ``encode`` element-wise."""
        # Start the multi-process pool on two 'cpu' targets (not CUDA devices).
        pool = self.model.start_multi_process_pool(['cpu', 'cpu'])
        try:
            sentences = ["This is sentence {}".format(i) for i in range(1000)]

            # Compute the embeddings using the multi-process pool
            emb = self.model.encode_multi_process(sentences, pool, chunk_size=50)
            self.assertEqual(emb.shape, (len(sentences), 768))

            emb_normal = self.model.encode(sentences)

            # Largest absolute element-wise deviation between the two results.
            diff = np.max(np.abs(emb - emb_normal))
            print("Max multi proc diff", diff)
            self.assertLess(diff, 0.001)
        finally:
            # Always shut the workers down, even when an assertion fails,
            # so no worker processes outlive the test.
            self.model.stop_multi_process_pool(pool)
sentences in parallel. This gives a near linear speed-up
when encoding large text collections.
"""

from sentence_transformers import SentenceTransformer, LoggingHandler
import logging

# Emit INFO-and-above records, prefixed with a timestamp, through the
# LoggingHandler imported from sentence_transformers.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Important: shield your code with `if __name__ == '__main__'`. Otherwise,
# CUDA runs into issues when spawning new processes.
if __name__ == '__main__':

    # Create a large list of 100k sentences
    sentences = ["This is sentence {}".format(i) for i in range(100000)]

    # Define the model
    model = SentenceTransformer('paraphrase-distilroberta-base-v1')

    # Start the multi-process pool on all available CUDA devices
    pool = model.start_multi_process_pool()

    try:
        # Compute the embeddings using the multi-process pool
        emb = model.encode_multi_process(sentences, pool)
        print("Embeddings computed. Shape:", emb.shape)
    finally:
        # Stop the processes in the pool. Doing this in `finally` ensures the
        # workers are shut down even when encoding raises.
        model.stop_multi_process_pool(pool)
class ComputeEmbeddingsTest(unittest.TestCase):
    """Check shapes, checksums and options of ``SentenceTransformer.encode``.

    The expected sums are reference values for the
    ``paraphrase-distilroberta-base-v1`` model that ``setUp`` loads.
    """

    def setUp(self):
        """Load the model under test before each test method."""
        self.model = SentenceTransformer('paraphrase-distilroberta-base-v1')

    def test_encode_token_embeddings(self):
        """Test that ``encode(output_value='token_embeddings')`` works.

        One result is expected per input sentence, and its first dimension
        must equal the number of input ids the tokenizer yields for it.
        """
        sent = [
            "Hello Word, a test sentence",
            "Here comes another sentence",
            "My final sentence",
            "Sentences",
            "Sentence five five five five five five five",
        ]
        emb = self.model.encode(sent, output_value='token_embeddings', batch_size=2)
        self.assertEqual(len(emb), len(sent))
        for sentence, token_emb in zip(sent, emb):
            input_ids = self.model.tokenize([sentence])['input_ids'][0]
            self.assertEqual(len(input_ids), token_emb.shape[0])

    def test_encode_single_sentences(self):
        """Encode a bare string, a one-element list and a three-element list."""
        # Single sentence: a plain string is expected to give a 1-D vector.
        emb = self.model.encode("Hello Word, a test sentence")
        self.assertEqual(emb.shape, (768,))
        self.assertAlmostEqual(np.sum(emb), 7.9811716, delta=0.001)

        # Single sentence as list: same values, but with a leading batch axis.
        emb = self.model.encode(["Hello Word, a test sentence"])
        self.assertEqual(emb.shape, (1, 768))
        self.assertAlmostEqual(np.sum(emb), 7.9811716, delta=0.001)

        # Sentence list
        emb = self.model.encode([
            "Hello Word, a test sentence",
            "Here comes another sentence",
            "My final sentence",
        ])
        self.assertEqual(emb.shape, (3, 768))
        self.assertAlmostEqual(np.sum(emb), 22.968266, delta=0.001)

    def test_encode_normalize(self):
        """With ``normalize_embeddings=True`` every row must have unit norm."""
        emb = self.model.encode(
            [
                "Hello Word, a test sentence",
                "Here comes another sentence",
                "My final sentence",
            ],
            normalize_embeddings=True,
        )
        self.assertEqual(emb.shape, (3, 768))
        # One L2 norm per sentence embedding (row).
        for norm in np.linalg.norm(emb, axis=1):
            self.assertAlmostEqual(norm, 1, delta=0.001)

    def test_encode_tuple_sentences(self):
        """Encode (first, second) sentence tuples, alone and in a list."""
        # Input a sentence tuple
        emb = self.model.encode([
            ("Hello Word, a test sentence", "Second input for model"),
        ])
        self.assertEqual(emb.shape, (1, 768))
        self.assertAlmostEqual(np.sum(emb), 9.503508, delta=0.001)

        # List of sentence tuples
        emb = self.model.encode([
            ("Hello Word, a test sentence", "Second input for model"),
            ("My second tuple", "With two inputs"),
            ("Final tuple", "final test"),
        ])
        self.assertEqual(emb.shape, (3, 768))
        self.assertAlmostEqual(np.sum(emb), 32.14627, delta=0.001)

    def test_multi_gpu_encode(self):
        """Multi-process encoding must match plain ``encode`` output."""
        # Start the multi-process pool on two 'cpu' targets (not CUDA devices).
        pool = self.model.start_multi_process_pool(['cpu', 'cpu'])
        try:
            sentences = ["This is sentence {}".format(i) for i in range(1000)]

            # Compute the embeddings using the multi-process pool
            emb = self.model.encode_multi_process(sentences, pool, chunk_size=50)
            self.assertEqual(emb.shape, (len(sentences), 768))

            # Summed absolute difference against the single-process result.
            emb_normal = self.model.encode(sentences)
            diff = np.sum(np.abs(emb - emb_normal))
            self.assertLess(diff, 0.001)
        finally:
            # Always shut the workers down, even when an assertion fails,
            # so no worker processes outlive the test.
            self.model.stop_multi_process_pool(pool)