Example #1
    def __init__(self, data_file: str, cutoff: int):

        super().__init__(data_file=data_file)
        self.cutoff = cutoff

        self.tokenizer = CustomTokenizer()
        df = pd.read_csv(data_file)

        target_vocab = Vocabulary(add_unk=False)
        for category in sorted(set(df.category)):
            target_vocab.add_token(category)

        # Count word frequencies over all titles (skipping bare punctuation)
        # and track the length of the longest tokenized title.
        word_counts = Counter()
        max_title = 0
        for title in df.title:
            tokens = self.tokenizer.tokenize(text=title)
            max_title = max(max_title, len(tokens))
            for token in tokens:
                if token not in string.punctuation:
                    word_counts[token] += 1

        # Data vocabulary: keep only words that occur at least `cutoff` times.
        data_vocab = SequenceVocabulary()
        for word, word_count in word_counts.items():
            if word_count >= self.cutoff:
                data_vocab.add_token(word)

        self.data_vocab = data_vocab
        self.target_vocab = target_vocab
        self.max_title = max_title + 2  # +2 presumably reserves room for begin/end-of-sequence markers
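The key step above is the frequency cutoff: only words seen at least `cutoff` times make it into the data vocabulary, which keeps rare and noisy tokens out. A minimal, self-contained sketch of that filtering step (plain `Counter`, no custom classes; `tokenize` stands in for `CustomTokenizer.tokenize`):

import string
from collections import Counter

def build_cutoff_vocab(titles, tokenize, cutoff):
    # Count every non-punctuation token across all titles.
    counts = Counter(token for title in titles
                     for token in tokenize(title)
                     if token not in string.punctuation)
    # Keep only tokens that occur at least `cutoff` times.
    return {token for token, count in counts.items() if count >= cutoff}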
Example #2
    def __init__(self, data_file: str):
        super().__init__(data_file=data_file)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        df = pd.read_csv(data_file)
        target_vocab = Vocabulary(add_unk=False)
        for category in sorted(set(df.category)):
            target_vocab.add_token(category)
        self.target_vocab = target_vocab
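Here tokenization is delegated entirely to the pretrained WordPiece tokenizer, so only the label vocabulary is built from the CSV. A quick illustration of what the tokenizer produces (assuming the Hugging Face transformers package supplies `BertTokenizer`; the exact subword splits depend on the `bert-base-uncased` vocabulary):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer.tokenize("Transfer learning for NLP")  # lowercased WordPiece tokens
ids = tokenizer.convert_tokens_to_ids(tokens)             # integer ids from BERT's own vocabulary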
Example #3
class BertVectorizer(Vectorizer):
    def __init__(self, data_file: str, bert_version: str):
        super().__init__(data_file=data_file)
        self.tokenizer = BertTokenizer.from_pretrained(bert_version)
        df = pd.read_csv(data_file)
        self.target_vocab = Vocabulary(add_unk=False)
        self.target_vocab.add_many(set(df.category))

    def vectorize(self, title: str, max_seq_length: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        # Wrap the WordPiece tokens in BERT's special classification/separator tokens.
        tokens = ["[CLS]"] + self.tokenizer.tokenize(title) + ["[SEP]"]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        token_type_ids = [0] * len(tokens)     # single-sentence input: every segment id is 0
        attention_mask = [1] * len(input_ids)  # attend to every real (non-padding) token
        # Right-pad all three sequences with zeros up to max_seq_length
        # (assumes the tokenized title already fits; no truncation is applied).
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids, attention_mask, token_type_ids = [x + padding for x in (input_ids, attention_mask, token_type_ids)]
        return np.array(input_ids), np.array(attention_mask), np.array(token_type_ids)
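A hedged usage sketch of `BertVectorizer` (the CSV path, title, and `max_seq_length` value are made up for illustration); each returned array has length `max_seq_length` and can be batched and fed straight to a BERT model:

vectorizer = BertVectorizer(data_file='news_titles.csv', bert_version='bert-base-uncased')
input_ids, attention_mask, token_type_ids = vectorizer.vectorize(
    title="Feedly adds NLP features", max_seq_length=32)
assert input_ids.shape == attention_mask.shape == token_type_ids.shape == (32,)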
Example #4
    def __init__(self, data_file: str):
        super().__init__(data_file=data_file)
        self.tokenizer = CharacterTokenizer()
        df = pd.read_csv(data_file)

        data_vocab = SequenceVocabulary()
        target_vocab = Vocabulary(add_unk=False)

        max_surname = 0
        for index, row in df.iterrows():
            tokens = self.tokenizer.tokenize(row.surname)
            max_surname = max(max_surname, len(tokens))
            data_vocab.add_many(tokens=tokens)
            target_vocab.add_token(row.nationality)

        self.data_vocab = data_vocab
        self.target_vocab = target_vocab
        self._max_surname = max_surname + 2  # +2 presumably reserves room for begin/end-of-sequence markers
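`CharacterTokenizer` is not shown in these snippets; presumably it splits a surname into individual characters, so a stand-in could be as simple as the following (hypothetical, for illustration only):

class CharacterTokenizer:
    def tokenize(self, text: str) -> list:
        # One token per character, e.g. "Nadeau" -> ['N', 'a', 'd', 'e', 'a', 'u']
        return list(text)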
Example #5
    def __init__(self, data_file: str):

        super().__init__(data_file=data_file)
        self.tokenizer = CharacterTokenizer()

        df = pd.read_csv(data_file)
        data_vocab = Vocabulary(unk_token='@')
        target_vocab = Vocabulary(add_unk=False)

        # Add surnames and nationalities to vocabulary
        for index, row in df.iterrows():
            surname = row.surname
            nationality = row.nationality
            data_vocab.add_many(tokens=self.tokenizer.tokenize(text=surname))
            target_vocab.add_token(token=nationality)

        self.data_vocab = data_vocab
        self.target_vocab = target_vocab
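Unlike the previous example, this variant uses a plain `Vocabulary` with '@' as the unknown marker and does not track a maximum surname length. A hedged usage sketch, assuming the enclosing class is called `SurnameVectorizer` (the name is not shown in the snippet) and that `lookup_token` falls back to the unknown index for unseen characters, as the `add_unk` behaviour suggests:

vectorizer = SurnameVectorizer(data_file='surnames.csv')       # hypothetical file name
char_index = vectorizer.data_vocab.lookup_token('é')           # index of '@' if 'é' was never seen
label_index = vectorizer.target_vocab.lookup_token('French')   # index of a known nationality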
Example #6
    def test_vocabulary(self):
        voc = Vocabulary()
        self.assertEqual(first=voc.to_serializable(),
                         second={
                             'token2id': {
                                 '<UNK>': 0
                             },
                             'add_unk': True,
                             'unk_token': '<UNK>'
                         })

        voc = Vocabulary()
        token = 'Feedly'
        voc.add_token(token=token)
        self.assertEqual(first=voc.lookup_token(token=token), second=1)
        self.assertEqual(first=voc.lookup_index(index=1), second=token)

        voc = Vocabulary()
        tokens = ['Feedly', 'NLP']
        voc.add_many(tokens=tokens)
        self.assertEqual(first=voc.lookup_token(token='Feedly'), second=1)
        self.assertEqual(first=voc.lookup_index(index=1), second='Feedly')
        self.assertEqual(first=voc.lookup_token(token='NLP'), second=2)
        self.assertEqual(first=voc.lookup_index(index=2), second='NLP')
        self.assertEqual(first=voc.to_serializable(),
                         second={
                             'token2id': {
                                 '<UNK>': 0,
                                 'Feedly': 1,
                                 'NLP': 2
                             },
                             'add_unk': True,
                             'unk_token': '<UNK>'
                         })

        voc = Vocabulary.from_serializable(
            contents={
                'token2id': {
                    '<UNK>': 0,
                    'Feedly': 1,
                    'NLP': 2
                },
                'add_unk': True,
                'unk_token': '<UNK>'
            })
        self.assertEqual(first=voc.to_serializable(),
                         second={
                             'token2id': {
                                 '<UNK>': 0,
                                 'Feedly': 1,
                                 'NLP': 2
                             },
                             'add_unk': True,
                             'unk_token': '<UNK>'
                         })

        self.assertEqual(first=len(voc), second=3)
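For reference, a minimal `Vocabulary` sketch that satisfies the assertions in this test; the project's actual implementation may differ in its details:

class Vocabulary:
    def __init__(self, token2id=None, add_unk=True, unk_token='<UNK>'):
        self._token2id = {} if token2id is None else dict(token2id)
        self._id2token = {index: token for token, index in self._token2id.items()}
        self._add_unk = add_unk
        self._unk_token = unk_token
        if add_unk:
            # Ensure the unknown token occupies an index (0 in a fresh vocabulary).
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        # Return the existing index, or assign the next free one.
        if token in self._token2id:
            return self._token2id[token]
        index = len(self._token2id)
        self._token2id[token] = index
        self._id2token[index] = token
        return index

    def add_many(self, tokens):
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        # Fall back to the unknown index only when the vocabulary has one.
        if self._add_unk:
            return self._token2id.get(token, self.unk_index)
        return self._token2id[token]

    def lookup_index(self, index):
        return self._id2token[index]

    def to_serializable(self):
        return {'token2id': self._token2id,
                'add_unk': self._add_unk,
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def __len__(self):
        return len(self._token2id)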