class TestCharVectorizerWithCharactersSet(TestCase):

    def setUp(self) -> None:
        self.characters = list('aeiouy')
        self.vectorizer = CharVectorizer(characters=self.characters)

    def test_fit_on_texts(self):
        self.vectorizer.fit_on_texts(DOCS)
        # One entry per character, plus the PAD token.
        self.assertEqual(len(self.vectorizer.token2id),
                         len(self.characters) + 1)
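# The tests in this module assume fixtures defined elsewhere in the file
# (imports, sample documents, and shape limits). A minimal sketch of what
# they might look like: the names come from the tests, and every value
# below is a hypothetical placeholder, not the real fixture data.
#
#   from os import path
#   from unittest import TestCase
#   from nltk.tokenize import sent_tokenize
#   from char_vectorizer import CharVectorizer  # hypothetical module path
#
#   DOC0 = 'the quick brown fox jumps over the lazy dog'
#   DOC1 = 'pack my box with five dozen liquor jugs'
#   DOCS = [DOC0, DOC1]
#   SMALL_MAX_SENTENCES, SMALL_MAX_WORDS, SMALL_MAX_CHARACTERS = 2, 4, 3
#   LARGE_MAX_WORDS, LARGE_MAX_CHARACTERS = 20, 20
#   # DOC0_SWC_TPRE, DOC0_SWC_TPOST, etc. hold the expected truncated outputs.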
class TestCharVectorizerWithNumChars(TestCase):

    def setUp(self) -> None:
        self.num_chars = 20
        self.vectorizer = CharVectorizer(num_chars=self.num_chars)

    def test_fit_on_texts(self):
        self.vectorizer.fit_on_texts(DOCS)
        # The vocabulary is capped at num_chars entries, plus the PAD token.
        self.assertEqual(len(self.vectorizer.token2id), self.num_chars + 1)
class TestCharVectorizerExceptions(TestCase):

    def setUp(self) -> None:
        self.vectorizer = CharVectorizer()
        self.vectorizer.fit_on_texts(DOCS)

    def test_texts_to_sequences_truncating_exception(self):
        # 'middle' is not a valid truncating mode.
        self.assertRaises(
            ValueError, self.vectorizer.texts_to_vectors, DOCS,
            (SMALL_MAX_SENTENCES, SMALL_MAX_WORDS, SMALL_MAX_CHARACTERS),
            'pre', 'middle')

    def test_texts_to_sequences_padding_exception(self):
        # 'middle' is not a valid padding mode.
        self.assertRaises(
            ValueError, self.vectorizer.texts_to_vectors, DOCS,
            (SMALL_MAX_SENTENCES, SMALL_MAX_WORDS, SMALL_MAX_CHARACTERS),
            'middle', 'post')
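# A minimal sketch of the kind of argument validation the two tests above
# exercise; this is an assumption about the semantics, not the library's
# actual implementation. The tests accept 'pre', 'post', and (per the
# offsets test further below) float offsets, while 'middle' must raise
# ValueError.
def _check_mode_sketch(mode, name):
    # Accept the documented string modes or a fractional offset.
    if mode in ('pre', 'post') or isinstance(mode, float):
        return mode
    raise ValueError(f"{name} must be 'pre', 'post' or a float, got {mode!r}")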
class TestCharVectorizerWithLargeValues(TestCase):

    def setUp(self) -> None:
        self.vectorizer = CharVectorizer()
        self.vectorizer.fit_on_texts(DOCS)

    def test_texts_to_sequences_with_padding_post(self):
        vectors = self.vectorizer.texts_to_vectors(
            DOCS,
            shape=(LARGE_MAX_WORDS, LARGE_MAX_CHARACTERS),
            padding='post')
        # Length of the first word of DOC0.
        word_len = len(DOC0.split()[0])
        # Number of zero (PAD) values expected in the vector.
        num_zeros = LARGE_MAX_CHARACTERS - word_len
        expected_post = [0] * num_zeros
        # The last num_zeros values of DOC0, word 0 should be padding.
        post = vectors[0][0][-num_zeros:].tolist()
        self.assertListEqual(expected_post, post)

    def test_texts_to_sequences_with_padding_pre(self):
        vectors = self.vectorizer.texts_to_vectors(
            DOCS,
            shape=(LARGE_MAX_WORDS, LARGE_MAX_CHARACTERS),
            padding='pre')
        # Length of the first word of DOC0.
        word_len = len(DOC0.split()[0])
        # Number of zero (PAD) values expected in the vector.
        num_zeros = LARGE_MAX_CHARACTERS - word_len
        expected_pre = [0] * num_zeros
        # The first num_zeros values of DOC0, word 0 should be padding.
        pre = vectors[0][0][:num_zeros].tolist()
        self.assertListEqual(expected_pre, pre)
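# For reference, a standalone sketch of the padding behavior asserted above,
# assuming PAD maps to id 0 (consistent with the fit_on_texts tests). Not
# the library's code.
def _pad_sketch(ids, length, padding='post'):
    # Append zeros for 'post', prepend them for 'pre'.
    zeros = [0] * (length - len(ids))
    return ids + zeros if padding == 'post' else zeros + ids

# _pad_sketch([7, 3, 5], 5)         -> [7, 3, 5, 0, 0]
# _pad_sketch([7, 3, 5], 5, 'pre')  -> [0, 0, 7, 3, 5]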
class TestCharVectorizerWithDefaultValues(TestCase):

    def setUp(self) -> None:
        self.vectorizer = CharVectorizer()
        self.vectorizer.fit_on_texts(DOCS)

    def test_fit_on_texts(self):
        # Size of the default vocabulary, including PAD.
        self.assertEqual(len(self.vectorizer.token2id), 36)

    def test_texts_to_vectors(self):
        vectors = self.vectorizer.texts_to_vectors(DOCS)
        docs_stats, words_stats = self.vectorizer.stats()
        shape = (len(DOCS), docs_stats['max'], words_stats['max'])
        self.assertEqual(vectors.shape, shape)

    def test_vectors_to_texts(self):
        vectors = self.vectorizer.texts_to_vectors(DOCS)
        docs_chars = self.vectorizer.vectors_to_texts(vectors)
        text0 = self.vectorizer._apply_filters(DOC0)
        doc0_chars = [list(word) for word in text0.split()]
        self.assertListEqual(docs_chars[0], doc0_chars)
        docs_words = self.vectorizer.vectors_to_texts(vectors, as_words=True)
        doc0_words = text0.split()
        self.assertListEqual(docs_words[0], doc0_words)
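# Round-trip sketch: texts_to_vectors maps each document to a
# (max_words, max_chars) grid of token ids, and vectors_to_texts maps it
# back to the filtered characters, or to whole words with as_words=True.
# Illustrative only; the input and outputs below are assumptions:
#
#   v = CharVectorizer()
#   v.fit_on_texts(['abc de'])
#   vecs = v.texts_to_vectors(['abc de'])
#   v.vectors_to_texts(vecs)                 # [[['a', 'b', 'c'], ['d', 'e']]]
#   v.vectors_to_texts(vecs, as_words=True)  # [['abc', 'de']]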
class TestCharVectorizerWithSmallValues(TestCase):

    def setUp(self) -> None:
        self.vectorizer = CharVectorizer()
        self.vectorizer.fit_on_texts(DOCS)

    def test_vectors_to_texts_with_truncating_pre(self):
        vectors = self.vectorizer.texts_to_vectors(
            DOCS,
            shape=(SMALL_MAX_WORDS, SMALL_MAX_CHARACTERS),
            truncating='pre')
        docs = self.vectorizer.vectors_to_texts(vectors)
        self.assertListEqual(docs[0], DOC0_SWC_TPRE)
        self.assertListEqual(docs[1], DOC1_SWC_TPRE)

    def test_vectors_to_texts_with_truncating_post(self):
        vectors = self.vectorizer.texts_to_vectors(
            DOCS,
            shape=(SMALL_MAX_WORDS, SMALL_MAX_CHARACTERS),
            truncating='post')
        docs = self.vectorizer.vectors_to_texts(vectors)
        self.assertListEqual(docs[0], DOC0_SWC_TPOST)
        self.assertListEqual(docs[1], DOC1_SWC_TPOST)

    def test_vectors_to_texts_truncating_offsets(self):
        vectorizer = CharVectorizer()
        with open(path.join(path.dirname(__file__), 'lorem_ipsum.txt')) as f:
            doc = f.read()
        doc_sents = sent_tokenize(doc)
        vectorizer.fit_on_texts(doc_sents)
        doc_sents = [vectorizer._apply_filters(s) for s in doc_sents]
        words_len = [len(sent.split()) for sent in doc_sents]
        chars_len = [len(word) for sent in doc_sents for word in sent.split()]
        avg_words = sum(words_len) // len(words_len)
        avg_chars = sum(chars_len) // len(chars_len)
        target_shape = (avg_words, avg_chars)
        # Fractional offsets: truncate around the middle of the overflow.
        truncating_shape = (0.5, 0.5)
        vectors = vectorizer.texts_to_vectors([doc],
                                              shape=target_shape,
                                              truncating=truncating_shape)
        # Ignore the first axis (number of texts).
        self.assertTupleEqual(vectors.shape[1:], target_shape)
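# Sketch of the fractional-offset truncation the last test relies on; this
# is an assumption about the semantics, not the library's code. A float f in
# [0, 1] keeps a window of the target length starting f of the way into the
# overflow, so 0.5 keeps the middle of an over-long sequence.
def _truncate_sketch(ids, length, offset=0.5):
    # Drop (len(ids) - length) items, split before/after per the offset.
    start = int((len(ids) - length) * offset)
    return ids[start:start + length]

# _truncate_sketch(list(range(10)), 4, 0.5)  -> [3, 4, 5, 6]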