Example #1
    def __init__(self, data_file: str, cutoff: int):

        super().__init__(data_file=data_file)
        self.cutoff = cutoff

        self.tokenizer = CustomTokenizer()
        df = pd.read_csv(data_file)

        target_vocab = Vocabulary(add_unk=False)
        for category in sorted(set(df.category)):
            target_vocab.add_token(category)

        # Count word frequencies over all titles (skipping bare punctuation)
        # and track the length of the longest tokenized title.
        word_counts = Counter()
        max_title = 0
        for title in df.title:
            tokens = self.tokenizer.tokenize(text=title)
            max_title = max(max_title, len(tokens))
            for token in tokens:
                if token not in string.punctuation:
                    word_counts[token] += 1

        # Data vocabulary: keep only words that occur at least `cutoff` times.
        data_vocab = SequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= self.cutoff:
                data_vocab.add_token(word)

        self.data_vocab = data_vocab
        self.target_vocab = target_vocab
        self.max_title = max_title + 2  # +2 presumably reserves room for begin/end-of-sequence markers
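The key step above is the frequency cutoff: only words seen at least `cutoff` times make it into the data vocabulary, which keeps rare and noisy tokens out. A minimal, self-contained sketch of that filtering step (plain `Counter`, no custom classes; `tokenize` stands in for `CustomTokenizer.tokenize`):

import string
from collections import Counter

def build_cutoff_vocab(titles, tokenize, cutoff):
    # Count every non-punctuation token across all titles.
    counts = Counter(token for title in titles
                     for token in tokenize(title)
                     if token not in string.punctuation)
    # Keep only tokens that occur at least `cutoff` times.
    return {token for token, count in counts.items() if count >= cutoff}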
Example #2
    def __init__(self, data_file: str):
        super().__init__(data_file=data_file)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        df = pd.read_csv(data_file)
        target_vocab = Vocabulary(add_unk=False)
        for category in sorted(set(df.category)):
            target_vocab.add_token(category)
        self.target_vocab = target_vocab
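Here tokenization is delegated entirely to the pretrained WordPiece tokenizer, so only the label vocabulary is built from the CSV. A quick illustration of what the tokenizer produces (assuming the Hugging Face transformers package supplies `BertTokenizer`; the exact subword splits depend on the `bert-base-uncased` vocabulary):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer.tokenize("Transfer learning for NLP")  # lowercased WordPiece tokens
ids = tokenizer.convert_tokens_to_ids(tokens)             # integer ids from BERT's own vocabulary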
Example #3
class BertVectorizer(Vectorizer):
    def __init__(self, data_file: str, bert_version: str):
        super().__init__(data_file=data_file)
        self.tokenizer = BertTokenizer.from_pretrained(bert_version)
        df = pd.read_csv(data_file)
        self.target_vocab = Vocabulary(add_unk=False)
        self.target_vocab.add_many(set(df.category))

    def vectorize(self, title: str, max_seq_length: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        # Wrap the WordPiece tokens in BERT's special classification/separator tokens.
        tokens = ["[CLS]"] + self.tokenizer.tokenize(title) + ["[SEP]"]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        token_type_ids = [0] * len(tokens)     # single-sentence input: every segment id is 0
        attention_mask = [1] * len(input_ids)  # attend to every real (non-padding) token
        # Right-pad all three sequences with zeros up to max_seq_length
        # (assumes the tokenized title already fits; no truncation is applied).
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids, attention_mask, token_type_ids = [x + padding for x in (input_ids, attention_mask, token_type_ids)]
        return np.array(input_ids), np.array(attention_mask), np.array(token_type_ids)
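A hedged usage sketch of `BertVectorizer` (the CSV path, title, and `max_seq_length` value are made up for illustration); each returned array has length `max_seq_length` and can be batched and fed straight to a BERT model:

vectorizer = BertVectorizer(data_file='news_titles.csv', bert_version='bert-base-uncased')
input_ids, attention_mask, token_type_ids = vectorizer.vectorize(
    title="Feedly adds NLP features", max_seq_length=32)
assert input_ids.shape == attention_mask.shape == token_type_ids.shape == (32,)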
Example #4
    def __init__(self, data_file: str):
        super().__init__(data_file=data_file)
        self.tokenizer = CharacterTokenizer()
        df = pd.read_csv(data_file)

        data_vocab = SequenceVocabulary()
        target_vocab = Vocabulary(add_unk=False)

        max_surname = 0
        for index, row in df.iterrows():
            tokens = self.tokenizer.tokenize(row.surname)
            max_surname = max(max_surname, len(tokens))
            data_vocab.add_many(tokens=tokens)
            target_vocab.add_token(row.nationality)

        self.data_vocab = data_vocab
        self.target_vocab = target_vocab
        self._max_surname = max_surname + 2  # +2 presumably reserves room for begin/end-of-sequence markers
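`CharacterTokenizer` is not shown in these snippets; presumably it splits a surname into individual characters, so a stand-in could be as simple as the following (hypothetical, for illustration only):

class CharacterTokenizer:
    def tokenize(self, text: str) -> list:
        # One token per character, e.g. "Nadeau" -> ['N', 'a', 'd', 'e', 'a', 'u']
        return list(text)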
Example #5
    def __init__(self, data_file: str):

        super().__init__(data_file=data_file)
        self.tokenizer = CharacterTokenizer()

        df = pd.read_csv(data_file)
        data_vocab = Vocabulary(unk_token='@')
        target_vocab = Vocabulary(add_unk=False)

        # Add surnames and nationalities to vocabulary
        for index, row in df.iterrows():
            surname = row.surname
            nationality = row.nationality
            data_vocab.add_many(tokens=self.tokenizer.tokenize(text=surname))
            target_vocab.add_token(token=nationality)

        self.data_vocab = data_vocab
        self.target_vocab = target_vocab
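Unlike the previous example, this variant uses a plain `Vocabulary` with '@' as the unknown marker and does not track a maximum surname length. A hedged usage sketch, assuming the enclosing class is called `SurnameVectorizer` (the name is not shown in the snippet) and that `lookup_token` falls back to the unknown index for unseen characters, as the `add_unk` behaviour suggests:

vectorizer = SurnameVectorizer(data_file='surnames.csv')       # hypothetical file name
char_index = vectorizer.data_vocab.lookup_token('é')           # index of '@' if 'é' was never seen
label_index = vectorizer.target_vocab.lookup_token('French')   # index of a known nationality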
Example #6
    def test_vocabulary(self):
        voc = Vocabulary()
        self.assertEqual(first=voc.to_serializable(),
                         second={
                             'token2id': {
                                 '<UNK>': 0
                             },
                             'add_unk': True,
                             'unk_token': '<UNK>'
                         })

        voc = Vocabulary()
        token = 'Feedly'
        voc.add_token(token=token)
        self.assertEqual(first=voc.lookup_token(token=token), second=1)
        self.assertEqual(first=voc.lookup_index(index=1), second=token)

        voc = Vocabulary()
        tokens = ['Feedly', 'NLP']
        voc.add_many(tokens=tokens)
        self.assertEqual(first=voc.lookup_token(token='Feedly'), second=1)
        self.assertEqual(first=voc.lookup_index(index=1), second='Feedly')
        self.assertEqual(first=voc.lookup_token(token='NLP'), second=2)
        self.assertEqual(first=voc.lookup_index(index=2), second='NLP')
        self.assertEqual(first=voc.to_serializable(),
                         second={
                             'token2id': {
                                 '<UNK>': 0,
                                 'Feedly': 1,
                                 'NLP': 2
                             },
                             'add_unk': True,
                             'unk_token': '<UNK>'
                         })

        voc = Vocabulary.from_serializable(
            contents={
                'token2id': {
                    '<UNK>': 0,
                    'Feedly': 1,
                    'NLP': 2
                },
                'add_unk': True,
                'unk_token': '<UNK>'
            })
        self.assertEqual(first=voc.to_serializable(),
                         second={
                             'token2id': {
                                 '<UNK>': 0,
                                 'Feedly': 1,
                                 'NLP': 2
                             },
                             'add_unk': True,
                             'unk_token': '<UNK>'
                         })

        self.assertEqual(first=len(voc), second=3)
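For reference, a minimal `Vocabulary` sketch that satisfies the assertions in this test; the project's actual implementation may differ in its details:

class Vocabulary:
    def __init__(self, token2id=None, add_unk=True, unk_token='<UNK>'):
        self._token2id = {} if token2id is None else dict(token2id)
        self._id2token = {index: token for token, index in self._token2id.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        if add_unk:
            # Ensure the unknown token occupies an index (0 in a fresh vocabulary).
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        # Return the existing index, or assign the next free one.
        if token in self._token2id:
            return self._token2id[token]
        index = len(self._token2id)
        self._token2id[token] = index
        self._id2token[index] = token
        return index

    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        # Fall back to the unknown index only when the vocabulary has one.
        if self._add_unk:
            return self._token2id.get(token, self.unk_index)
        return self._token2id[token]

    def lookup_index(self, index):
        return self._id2token[index]

    def to_serializable(self):
        return {'token2id': self._token2id,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def __len__(self):
        return len(self._token2id)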