def __init__(self, data_file: str, cutoff: int):
    super().__init__(data_file=data_file)
    self.cutoff = cutoff
    self.tokenizer = CustomTokenizer()
    df = pd.read_csv(data_file)

    # Build the target vocabulary from the sorted set of categories
    target_vocab = Vocabulary(add_unk=False)
    for category in sorted(set(df.category)):
        target_vocab.add_token(category)

    # Count word frequencies and track the longest tokenized title
    word_counts = Counter()
    max_title = 0
    for title in df.title:
        tokens = self.tokenizer.tokenize(text=title)
        max_title = max(max_title, len(tokens))
        for token in tokens:
            if token not in string.punctuation:
                word_counts[token] += 1

    # Keep only the words that occur at least `cutoff` times
    data_vocab = SequenceVocabulary()
    for word, word_count in word_counts.items():
        if word_count >= self.cutoff:
            data_vocab.add_token(word)

    self.data_vocab = data_vocab
    self.target_vocab = target_vocab
    # +2 leaves room for the begin- and end-of-sequence tokens
    self.max_title = max_title + 2
def __init__(self, data_file: str):
    super().__init__(data_file=data_file)
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    df = pd.read_csv(data_file)

    # Only a target vocabulary is needed; BERT's tokenizer handles the input side
    target_vocab = Vocabulary(add_unk=False)
    for category in sorted(set(df.category)):
        target_vocab.add_token(category)
    self.target_vocab = target_vocab
class BertVectorizer(Vectorizer):
    def __init__(self, data_file: str, bert_version: str):
        super().__init__(data_file=data_file)
        self.tokenizer = BertTokenizer.from_pretrained(bert_version)
        df = pd.read_csv(data_file)
        self.target_vocab = Vocabulary(add_unk=False)
        # Sort the categories so label indices are deterministic across runs
        self.target_vocab.add_many(sorted(set(df.category)))

    def vectorize(self, title: str, max_seq_length: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        # Wrap the title in BERT's special tokens before converting to ids
        tokens = ["[CLS]"] + self.tokenizer.tokenize(title) + ["[SEP]"]
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        token_type_ids = [0] * len(tokens)
        attention_mask = [1] * len(input_ids)

        # Pad every sequence out to max_seq_length
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids, attention_mask, token_type_ids = [
            x + padding for x in [input_ids, attention_mask, token_type_ids]
        ]
        return np.array(input_ids), np.array(attention_mask), np.array(token_type_ids)
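# A minimal usage sketch, not part of the original source: it relies only on the
# BertVectorizer API defined above and assumes a CSV with a `category` column.
# The file name, title string, and max_seq_length value are illustrative placeholders.
#
# vectorizer = BertVectorizer(data_file='news.csv', bert_version='bert-base-uncased')
# input_ids, attention_mask, token_type_ids = vectorizer.vectorize(
#     title='Feedly adds NLP-powered article summaries', max_seq_length=32)
# assert input_ids.shape == (32,)  # each returned array is padded to max_seq_length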
def __init__(self, data_file: str):
    super().__init__(data_file=data_file)
    self.tokenizer = CharacterTokenizer()
    df = pd.read_csv(data_file)

    data_vocab = SequenceVocabulary()
    target_vocab = Vocabulary(add_unk=False)
    max_surname = 0

    # Add surname characters and nationalities to the vocabularies,
    # tracking the longest tokenized surname as we go
    for index, row in df.iterrows():
        tokens = self.tokenizer.tokenize(row.surname)
        max_surname = max(max_surname, len(tokens))
        data_vocab.add_many(tokens=tokens)
        target_vocab.add_token(row.nationality)

    self.data_vocab = data_vocab
    self.target_vocab = target_vocab
    # +2 leaves room for the begin- and end-of-sequence tokens
    self._max_surname = max_surname + 2
def __init__(self, data_file: str):
    super().__init__(data_file=data_file)
    self.tokenizer = CharacterTokenizer()
    df = pd.read_csv(data_file)
    data_vocab = Vocabulary(unk_token='@')
    target_vocab = Vocabulary(add_unk=False)

    # Add surnames and nationalities to the vocabularies
    for index, row in df.iterrows():
        surname = row.surname
        nationality = row.nationality
        data_vocab.add_many(tokens=self.tokenizer.tokenize(text=surname))
        target_vocab.add_token(token=nationality)

    self.data_vocab = data_vocab
    self.target_vocab = target_vocab
def test_vocabulary(self):
    # A fresh vocabulary only contains the <UNK> token
    voc = Vocabulary()
    self.assertEqual(first=voc.to_serializable(),
                     second={'token2id': {'<UNK>': 0},
                             'add_unk': True,
                             'unk_token': '<UNK>'})

    # Adding a single token assigns the next free index
    voc = Vocabulary()
    token = 'Feedly'
    voc.add_token(token=token)
    self.assertEqual(first=voc.lookup_token(token=token), second=1)
    self.assertEqual(first=voc.lookup_index(index=1), second=token)

    # add_many preserves insertion order
    voc = Vocabulary()
    tokens = ['Feedly', 'NLP']
    voc.add_many(tokens=tokens)
    self.assertEqual(first=voc.lookup_token(token='Feedly'), second=1)
    self.assertEqual(first=voc.lookup_index(index=1), second='Feedly')
    self.assertEqual(first=voc.lookup_token(token='NLP'), second=2)
    self.assertEqual(first=voc.lookup_index(index=2), second='NLP')
    self.assertEqual(first=voc.to_serializable(),
                     second={'token2id': {'<UNK>': 0, 'Feedly': 1, 'NLP': 2},
                             'add_unk': True,
                             'unk_token': '<UNK>'})

    # Round-trip through from_serializable / to_serializable
    voc = Vocabulary.from_serializable(
        contents={'token2id': {'<UNK>': 0, 'Feedly': 1, 'NLP': 2},
                  'add_unk': True,
                  'unk_token': '<UNK>'})
    self.assertEqual(first=voc.to_serializable(),
                     second={'token2id': {'<UNK>': 0, 'Feedly': 1, 'NLP': 2},
                             'add_unk': True,
                             'unk_token': '<UNK>'})
    self.assertEqual(first=len(voc), second=3)
def __init__(self, data_file: str, bert_version: str):
    super().__init__(data_file=data_file)
    self.tokenizer = BertTokenizer.from_pretrained(bert_version)
    df = pd.read_csv(data_file)
    self.target_vocab = Vocabulary(add_unk=False)
    # Sort the categories so label indices are deterministic across runs
    self.target_vocab.add_many(sorted(set(df.category)))