def test_train_from_iterator(self):
    """Train a fresh SentencePiece BPE tokenizer from an in-memory iterator
    and verify a known encoding."""
    corpus = ["A first sentence", "Another sentence", "And a last one"]
    tok = SentencePieceBPETokenizer()
    tok.train_from_iterator(corpus, show_progress=False)
    encoded = tok.encode("A sentence")
    assert encoded.tokens == ["▁A", "▁sentence"]
def fit_on_text(self, text):
    """Extend the word2idx/idx2word vocabularies with every sub-word token
    produced by BPE-encoding the words of `text`."""
    if self.lower:
        text = text.lower()
    # NOTE: `vocab` and `merges` are resolved from the enclosing scope.
    bpe = SentencePieceBPETokenizer(vocab, merges)
    for word in text.split():
        for piece in bpe.encode(word).tokens:
            if piece in self.word2idx:
                continue
            self.word2idx[piece] = self.idx
            self.idx2word[self.idx] = piece
            self.idx += 1
def __init__(self, fname, tokenizer, dat_fname):
    """Build (or load from cache) the paired-sections dataset.

    Args:
        fname: path to the source Excel file; columns 0/1 are section
            titles, 2/3 are section bodies, 4 is the class label.
        tokenizer: project tokenizer providing `text_to_sequence`.
        dat_fname: pickle cache path; if it exists it is loaded instead
            of re-reading the Excel file.
    """
    if os.path.exists(dat_fname):
        print('loading dataset:', dat_fname)
        # Context manager closes the cache file (the original leaked the handle).
        with open(dat_fname, 'rb') as f:
            self.data = pickle.load(f)
    else:
        # NOTE(review): `vocab` and `merges` are not defined in this scope —
        # presumably module-level globals; confirm.
        tokenizer1 = SentencePieceBPETokenizer(vocab, merges)
        reader = pd.read_excel(fname)
        all_data = []
        for i in range(reader.shape[0]):
            text_raw1 = []
            text_raw2 = []
            column_name1 = tokenizer1.encode(reader.iloc[i][0].lower().strip()).tokens
            # Plain loops instead of the original side-effecting list comprehensions.
            for word in reader.iloc[i][2].lower().strip().split(' '):
                text_raw1.extend(tokenizer1.encode(word).tokens)
            column_name2 = tokenizer1.encode(reader.iloc[i][1].lower().strip()).tokens
            for word in reader.iloc[i][3].lower().strip().split(' '):
                text_raw2.extend(tokenizer1.encode(word).tokens)
            class_n = reader.iloc[i][4]
            text_raw_indices1 = tokenizer.text_to_sequence(text_raw1)
            aspect_indices1 = tokenizer.text_to_sequence(column_name1)
            text_raw_indices2 = tokenizer.text_to_sequence(text_raw2)
            aspect_indices2 = tokenizer.text_to_sequence(column_name2)
            data = {
                'text_raw_indices1': text_raw_indices1,
                'aspect_indices1': aspect_indices1,
                'text_raw_indices2': text_raw_indices2,
                'aspect_indices2': aspect_indices2,
                'class_n': int(class_n),
            }
            all_data.append(data)
        self.data = all_data
        # Context manager guarantees the cache is flushed and closed.
        with open(dat_fname, 'wb') as f:
            pickle.dump(self.data, f)
        print("Finished write data file")
class TokenizerWrapper:
    """Uniform facade over several tokenization back-ends.

    `tok_type` selects the implementation: 'bpe', 'wordpiece' or
    'sentencepiece' use a trainable HuggingFace tokenizer; 'word' uses
    nltk word tokenization; 'char' splits into characters.
    """

    def __init__(self, tok_type, unk_token, sep_token, cls_token, pad_token,
                 mask_token):
        self.tok_type = tok_type
        if self.tok_type == 'bpe':
            self.tokenizer = ByteLevelBPETokenizer()
        elif self.tok_type == 'wordpiece':
            self.tokenizer = BertWordPieceTokenizer(unk_token=unk_token,
                                                    sep_token=sep_token,
                                                    cls_token=cls_token,
                                                    pad_token=pad_token,
                                                    mask_token=mask_token)
        elif self.tok_type == 'sentencepiece':
            self.tokenizer = SentencePieceBPETokenizer(unk_token=unk_token)
        # 'word' and 'char' need no underlying tokenizer object.

    def train(self, data_file, vocab_size, special_tokens):
        """Train the underlying subword tokenizer; no-op for word/char modes."""
        if self.tok_type in ['bpe', 'wordpiece', 'sentencepiece']:
            self.tokenizer.train([data_file], vocab_size=vocab_size,
                                 special_tokens=special_tokens)

    def tokenize(self, text):
        """Return `text` as a list of tokens according to `tok_type`."""
        if self.tok_type in ['bpe', 'wordpiece', 'sentencepiece']:
            return self.tokenizer.encode(text).tokens
        elif self.tok_type == 'word':
            return nltk.tokenize.word_tokenize(text)
        elif self.tok_type == 'char':
            return [c for c in text]
        else:
            raise Exception('Unknown tokenizer: ' + self.tok_type)

    def decode(self, tokens, blank_token):
        """Reassemble a token list into a string.

        Unknown tokens (no id in the vocabulary) are replaced with
        `blank_token`'s id before decoding.
        """
        if self.tok_type in ['bpe', 'wordpiece', 'sentencepiece']:
            ids = [self.tokenizer.token_to_id(t) for t in tokens]
            # Fixed: compare to None with `is not`, not `!=`.
            ids = [
                i if i is not None else self.tokenizer.token_to_id(blank_token)
                for i in ids
            ]
            return self.tokenizer.decode(ids, skip_special_tokens=False)
        elif self.tok_type == 'word':
            return ' '.join(tokens)
        elif self.tok_type == 'char':
            return ''.join(tokens)
        else:
            raise Exception('Unknown tokenizer: ' + self.tok_type)
class DecodeBySentencePieceBPETokenizer(Preprocessor):
    """Preprocessor that BPE-tokenizes joined text data, optionally
    framing it with start/end-of-sentence ids."""

    __provider__ = 'decode_by_sentence_piece_bpe_tokenizer'

    @classmethod
    def parameters(cls):
        params = super().parameters()
        params.update({
            'vocabulary_file': PathField(),
            'merges_file': PathField(),
            'sos_symbol': StringField(optional=True, default='<s>'),
            'eos_symbol': StringField(optional=True, default='</s>'),
            'add_extra_symbols': BoolField(optional=True, default=True),
        })
        return params

    def configure(self):
        # When the tokenizers package failed to import, the symbol is an
        # UnsupportedPackage stub that raises a helpful error.
        if isinstance(SentencePieceBPETokenizer, UnsupportedPackage):
            SentencePieceBPETokenizer.raise_error(self.__provider__)
        vocab_path = str(self.get_value_from_config('vocabulary_file'))
        merges_path = str(self.get_value_from_config('merges_file'))
        self.tokenizer = SentencePieceBPETokenizer(vocab_path, merges_path)
        self.add_extra_symbols = self.get_value_from_config('add_extra_symbols')
        self.idx = {
            marker: self.tokenizer.token_to_id(
                str(self.get_value_from_config(marker + '_symbol')))
            for marker in ('sos', 'eos')
        }

    def process(self, image, annotation_meta=None):
        joined = " ".join(image.data)
        token_ids = self.tokenizer.encode(joined).ids
        if self.add_extra_symbols:
            token_ids = [self.idx['sos'], *token_ids, self.idx['eos']]
        image.data = token_ids
        image.metadata['decoded'] = True
        image.identifier = "tokens"
        return image
def build_bpe(vocab_size=10000):
    """Clean the corpus files under `mypath`, train a SentencePiece BPE
    tokenizer on them, print a sample encoding and save the model.

    Args:
        vocab_size: target vocabulary size for BPE training.
    """
    # Initialize a tokenizer
    tokenizer = SentencePieceBPETokenizer()
    mypath = "../../Desktop/cood/python/machine-learning/old-school/markov-lstm-killer/data/fi"
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    print("ONL", onlyfiles)
    paths = [mypath + "/" + f for f in onlyfiles]

    # Copy each .txt file into ./data, cleaning it line by line.
    txts = []
    for path, fname in zip(paths, onlyfiles):
        if path.endswith(".txt"):
            localpath = "data/" + fname
            txts.append(localpath)
            # with-blocks close both handles (the original leaked `infile`).
            with open(path) as infile, open(localpath, "w") as outfile:
                for line in infile:
                    outfile.write(cleanup(line) + "\n")

    # Then train it!
    tokenizer.train(txts, vocab_size=vocab_size)

    # Now, let's use it:
    s = "Det politiska arbetet har redan börjat på olika sätt, med resor, besök, möten, politikutveckling, motionsskrivande och mycket annat. Jag har sett att ni redan har varit aktiva under ett antal veckor, och jag kan försäkra er att det även gäller talmanspresidiet. Nu är det dags att med tillförsikt påbörja ett nytt riksdagsår. Jag hoppas att ni alla ser fram emot det lika mycket som jag gör."
    encoded = tokenizer.encode(s)
    print(encoded.ids)
    print(encoded.tokens)

    # And finally save it somewhere
    tokenizer.save("./bpe-fi.tokenizer.json")
# Score candidate texts with a masked-LM and stream results to a JSONL file.
# Resumable: utterances whose id is already in `processed_ids` are skipped.
if args.half:
    model.half()  # fp16 inference to halve memory
model.to(device)
start = time.time()
count = 0  # NOTE(review): never incremented below — the final print always shows 0; possibly vestigial.
# flush=True so partial results survive an interruption (enables the resume logic above).
with jsonlines.open(args.output, mode='w', flush=True) as writer:
    for id, utt in tqdm.tqdm(data.items(), desc="Texts"):
        if id in processed_ids:
            continue
        texts = utt['candidates']
        texts_ids = []
        for text in texts:
            if args.opi:
                # OPI tokenizer path: wrap raw ids into a batch-of-1 tensor.
                ids = torch.tensor([tokenizer.encode(text).ids])
            else:
                ids = tokenizer.encode(text, return_tensors="pt")
            texts_ids.append(ids)
        utt['probas'] = []
        for i, text_ids in enumerate(tqdm.tqdm(texts_ids, desc="Cands")):
            # text_prob presumably returns (log-probability, token count,
            # log-probability excluding the first token) — confirm against its definition.
            logprob, length, logprob_wo0 = text_prob(text_ids, tokenizer.mask_token_id, model, device, args.batch_size)
            utt['probas'].append((logprob, length, logprob_wo0))
        writer.write(utt)
end = time.time()
print('Time', end - start, count, file=sys.stderr)
class Tokenizer:
    """
    Sentence tokenizer.

    Arguments:
        path (str): path to tokenizer's model folder.
        max_tokens (int): max tokens.
    """

    def __init__(self, path, max_tokens):
        self.logger = log.getLogger("Tokenizer")
        self.logger.info("loading tokenizer")
        self.logger.info("path: " + path)
        self.logger.info("max_tokens: " + str(max_tokens))
        self.tokenizer = SentencePieceBPETokenizer(
            os.path.join(path, "vocab.json"),
            os.path.join(path, "merges.txt")
        )
        self.max_tokens = max_tokens
        # Cache ids of the special markers used for framing and padding.
        self.idx = {marker: self.tokenizer.token_to_id(marker)
                    for marker in ('</s>', '<s>', '<pad>')}

    def encode(self, sentence):
        """Encode a sentence into a framed, padded id sequence.

        Arguments:
            sentence (str): sentence.

        Returns:
            tokens (list): encoded sentence in tokenized format.
        """
        return self._extend_tokens(self.tokenizer.encode(sentence).ids)

    def decode(self, tokens, remove_repeats=True):
        """Decode token ids back into text, stripping special markers.

        Arguments:
            tokens: sentence in tokenized format.
            remove_repeats (bool): remove repeated words.

        Returns:
            sentence (str): output sentence.
        """
        text = self.tokenizer.decode(tokens)
        for marker in self.idx.keys():
            text = text.replace(marker, '')
        if remove_repeats:
            text = self._remove_repeats(text)
        return text.lstrip()

    def _extend_tokens(self, tokens):
        """Frame `tokens` with <s>/</s> and right-pad to max_tokens.

        If the framed sequence already exceeds max_tokens it is returned
        unpadded (and untruncated), matching the original behavior.
        """
        framed = [self.idx['<s>'], *tokens, self.idx['</s>']]
        missing = self.max_tokens - len(framed)
        if missing > 0:
            framed = framed + [self.idx['<pad>']] * missing
        return framed

    def _remove_repeats(self, sentence):
        """Lowercase the sentence and collapse consecutive duplicate words."""
        words = sentence.lower().split()
        deduped = (word for word, _ in itertools.groupby(words))
        return " ".join(deduped)
class BPEVocabulary(Vocabulary):
    """
    Represents a SentencePiece vocabulary for c2s.

    Holds two BPE encoders (targets and subtokens/terminals) plus a pickled
    node-type dictionary.
    """

    def __init__(self, args: Namespace):
        super().__init__()
        self.target_encoder = SentencePieceBPETokenizer(
            args.target_vocab, args.target_merges)
        self.subtoken_encoder = SentencePieceBPETokenizer(
            args.subtoken_vocab, args.subtoken_merges)
        with open(args.node_dict, "rb") as f:
            self.node_to_index = pickle.load(f)
        self.index_to_node = {v: k for k, v in self.node_to_index.items()}

    def target_vocab_size(self):
        # +4 presumably reserves ids for special tokens — confirm against model.
        return self.target_encoder.get_vocab_size() + 4

    def node_vocab_size(self):
        # +2 presumably reserves ids for special node symbols — confirm.
        return len(self.node_to_index) + 2

    def terminal_vocab_size(self):
        return self.subtoken_encoder.get_vocab_size() + 4

    def add_special_target_token(self, token: str):
        self.target_encoder.add_special_tokens([token])

    def add_special_terminal_token(self, token: str):
        self.subtoken_encoder.add_special_tokens([token])

    def encode_node(self, token_or_tokens):
        """Map a node label (or nested lists of labels) to indices; unknown
        labels fall back to the UNK index."""
        if isinstance(token_or_tokens, str):
            return self.node_to_index.get(token_or_tokens,
                                          self.node_to_index[self.UNK_TOKEN])
        else:
            return list(map(self.encode_node, token_or_tokens))

    def decode_node(self, index_or_indices):
        if isinstance(index_or_indices, int):
            return self.index_to_node[index_or_indices]
        else:
            return list(map(self.decode_node, index_or_indices))

    def encode_target(self, token_or_tokens):
        if isinstance(token_or_tokens, str):
            return self.target_encoder.token_to_id(token_or_tokens)
        else:
            return self.target_encoder.encode(" ".join(token_or_tokens)).ids

    def decode_target(self, index_or_indices):
        if isinstance(index_or_indices, int):
            return self.target_encoder.id_to_token(index_or_indices)
        else:
            return self.target_encoder.decode(index_or_indices)

    def encode_terminal(self, token_or_tokens):
        if isinstance(token_or_tokens, str):
            return self.subtoken_encoder.token_to_id(token_or_tokens)
        else:
            return self.subtoken_encoder.encode(" ".join(token_or_tokens)).ids

    def decode_terminal(self, index_or_indices):
        # Fixed: the original referenced self.terminal_encoder, which is never
        # assigned (the attribute is subtoken_encoder) and raised AttributeError.
        if isinstance(index_or_indices, int):
            return self.subtoken_encoder.id_to_token(index_or_indices)
        else:
            return self.subtoken_encoder.decode(index_or_indices)
class AGBDataReader(object):
    """Reads in the Heidelberg AGB dataset."""

    def __init__(self, dataset_folder, tokenizer_method):
        self.dataset_folder = dataset_folder
        self.tokenizer_method = tokenizer_method
        if tokenizer_method == "sentencepiece":
            self.tokenizer = SentencePieceBPETokenizer(
                "./data/sentencepiece_tokenizer/vocab.json",
                "./data/sentencepiece_tokenizer/merges.txt")
        elif tokenizer_method == "bert":
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def get_examples(self, filename, max_seq_length=1024, max_examples=0,
                     read_cache=False):
        """
        data_splits specified which data split to use (train, dev, test).
        Expects that self.dataset_folder contains the files in tsv
        (tab-separated form), with three columns (s1 \t s2 \t [0|1]).

        Returns a TensorDataset of (first_sections, second_sections, labels).
        """
        if self.tokenizer_method == "bert":
            self.tokenizer.max_len = max_seq_length
        if read_cache:
            # Load from saved features to save time.
            name = filename.replace(".tsv", "")
            if max_examples > 0:
                # Fixed: the concatenation result was discarded, so the cache
                # name never reflected max_examples.
                name = name + "_" + str(max_examples)
            with open('./data/dataset_' + name + ".pickle", 'rb') as file:
                dataset = pickle.load(file)
        else:
            with open(os.path.join(self.dataset_folder, filename)) as f:
                rows = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
                first_sections = []
                second_sections = []
                labels = []
                for sentence_a, sentence_b, label in rows:
                    if self.tokenizer_method == "sentencepiece":
                        sentence_a = self.tokenizer.encode(sentence_a).ids
                        sentence_b = self.tokenizer.encode(sentence_b).ids
                        # Right-pad with 0 then truncate to max_seq_length.
                        sentence_a += [0] * (max_seq_length - len(sentence_a))
                        sentence_b += [0] * (max_seq_length - len(sentence_b))
                        first_sections.append(
                            np.array(sentence_a[:max_seq_length]))
                        second_sections.append(
                            np.array(sentence_b[:max_seq_length]))
                    elif self.tokenizer_method == "bert":
                        sentence_a = self.tokenizer.encode(
                            sentence_a, max_length=max_seq_length,
                            return_tensors='pt')
                        sentence_b = self.tokenizer.encode(
                            sentence_b, max_length=max_seq_length,
                            return_tensors='pt')
                        sentence_a_pads = self.tokenizer.max_len - sentence_a.shape[-1]
                        sentence_b_pads = self.tokenizer.max_len - sentence_b.shape[-1]
                        sentence_a = F.pad(sentence_a,
                                           pad=(0, sentence_a_pads),
                                           value=self.tokenizer.pad_token_id)
                        sentence_b = F.pad(sentence_b,
                                           pad=(0, sentence_b_pads),
                                           value=self.tokenizer.pad_token_id)
                        first_sections.append(sentence_a)
                        second_sections.append(sentence_b)
                    labels.append(self.map_label(label))
                    if 0 < max_examples <= len(first_sections):
                        break
            dataset = TensorDataset(
                torch.LongTensor(
                    np.stack(first_sections, axis=0).reshape(
                        (-1, max_seq_length))),
                torch.LongTensor(
                    np.stack(second_sections, axis=0).reshape(
                        (-1, max_seq_length))),
                torch.FloatTensor(np.array(labels)))
            # Save the features (only when freshly built; reloading the cache
            # and immediately re-writing it would be redundant).
            name = filename.replace(".tsv", "")
            if max_examples > 0:
                # Fixed: same discarded-concatenation bug as above.
                name = name + "_" + str(max_examples)
            with open('./data/dataset_' + name + ".pickle", 'wb') as file:
                pickle.dump(dataset, file, protocol=4)
        return dataset

    @staticmethod
    def get_labels():
        # Adding different types of labels to assert correct conversion
        return {
            "same_section": 1,
            "other_section": 0,
            "1": 1,
            "0": 0,
            1: 1,
            0: 0
        }

    def get_num_labels(self):
        return len(self.get_labels())

    def map_label(self, label):
        return self.get_labels()[label.strip().lower()]
# Train a SentencePiece BPE tokenizer from scratch, persist it, reload it
# from the saved vocab/merges, and sanity-check it on a GFP protein sequence.
tokenizer = SentencePieceBPETokenizer(add_prefix_space=True)

tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<unk>'],
    limit_alphabet=1000
)

# Persist vocab/merges under the chosen name.
tokenizer.save(args.out, args.name)

# Round-trip: rebuild the tokenizer from the files just written.
vocab_file = join(args.out, '{}-vocab.json'.format(args.name))
merges_file = join(args.out, '{}-merges.txt'.format(args.name))
tokenizer = SentencePieceBPETokenizer(vocab_file, merges_file,
                                      add_prefix_space=True)

# Smoke-test the reloaded model.
logger.info('Tokens and their ids from SentencePiece with GFP protein sequence: \n MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT')
sample = tokenizer.encode('MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT')
logger.info(sample.tokens)
logger.info(sample.ids)
logger.info('done!')
class Doc(object):
    """Collects document sections and encodes them according to a chosen
    storage method ('raw', 'bert', 'roberta' or 'token')."""

    def __init__(self, storage_method, force_shorten, data_dir, tokenizer_path):
        self.all_lens = {}
        self.num_labels = None
        self.final_data = []
        self.final_triplet_data = []
        self.tokenizer_path = tokenizer_path
        self.storage_method = storage_method
        self.force_shorten = force_shorten
        self.tokenizer = None
        self.set_tokenizer()
        # Create data directory
        os.makedirs(data_dir, exist_ok=True)
        self.data_dir = data_dir

    def set_tokenizer(self):
        """Instantiate the tokenizer matching `storage_method`; raises
        ValueError for unknown methods."""
        if self.storage_method == "raw":
            pass  # Essentially keep it None. Important for exceptions
        elif self.storage_method == "bert":
            self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        elif self.storage_method == "roberta":
            self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        elif self.storage_method == "token":
            # Fixed: os.path.join(path, "/vocab.json") discards `path` because
            # the second component is absolute — use "vocab.json" like merges.
            self.tokenizer = SentencePieceBPETokenizer(
                os.path.join(self.tokenizer_path, "vocab.json"),
                os.path.join(self.tokenizer_path, "merges.txt"))
        else:
            raise ValueError("Unknown storage method encountered!")

    def transform_text_accordingly(self, text):
        """Encode `text` per the storage method, truncating to the model's
        512-token limit when `force_shorten` is set."""
        if self.storage_method == "raw":
            return text
        elif self.storage_method == "bert" or self.storage_method == "roberta":
            encoded_text = self.tokenizer.encode(text)
            # Shorten if necessary
            if self.force_shorten and len(encoded_text) > 512:
                # Still need the last token (the separator).
                return encoded_text[:511] + [encoded_text[-1]]
            else:
                return encoded_text
        else:  # the case for our own embedding
            encoded_text = self.tokenizer.encode(text).ids
            if self.force_shorten and len(encoded_text) > 512:
                return encoded_text[:512]  # Own encoding has no special symbols
            else:
                return encoded_text

    def _add_samples_triplet_loss(self, section_pos, section_neg, section):
        """Append one (anchor, positive, negative) triplet record."""
        self.final_triplet_data.append({
            "section_center": section,
            "section_pos": section_pos,
            "section_neg": section_neg
        })

    def generate_positive_samples(self, label, section, doc):
        """Add a same-label (positive) pair; returns the paired section."""
        second_section = self._add_samples(label, section, doc, 1)
        return second_section

    def generate_negative_samples(self, label, section, doc):
        """Add a different-label (negative) pair; returns the paired section."""
        # choose a random label as negative
        rand_neg_label = label
        while rand_neg_label == label:
            rand_neg_label = random.choice(list(doc.keys()))
        second_section = self._add_samples(rand_neg_label, section, doc, 0)
        return second_section
class TextProcessor:
    """SentencePiece-BPE text processor with per-language marker tokens.

    Either loads a trained tokenizer (vocab.json / merges.txt plus a
    pickled "langs" mapping) from `tok_model_path`, or trains one from
    scratch via `train_tokenizer`.
    """

    def __init__(self, tok_model_path: Optional[str] = None):
        self.languages = {}
        if tok_model_path is not None:
            self.tokenizer = SentencePieceBPETokenizer(
                tok_model_path + "/vocab.json",
                tok_model_path + "/merges.txt",
            )
            # "langs" presumably maps language marker tokens (e.g. "<en>")
            # to integer ids — confirm against the training side.
            with open(os.path.join(tok_model_path, "langs"), "rb") as fp:
                self.languages: Dict[str, int] = pickle.load(fp)
            self.init_properties(self.languages)

    def init_properties(self, languages: Dict[str, int] = None):
        """Set model limits and the special-token inventory; language markers
        are appended so they survive as dedicated vocabulary entries."""
        self.max_len = 512
        self.pad_token = "<pad>"
        self.mask_token = "<mask>"
        self.unk_token = "<unk>"
        self.sep_token = "</s>"
        self.bos = "<s>"
        self.special_tokens = [
            self.pad_token, self.bos, self.unk_token, self.mask_token,
            self.sep_token
        ] + list(languages.keys())
        self.languages = languages

    def train_tokenizer(self, paths: List[str], vocab_size: int,
                        to_save_dir: str, languages: Dict[str, int]):
        """Train a fresh BPE tokenizer on `paths` and save it with the
        language mapping."""
        self.tokenizer = SentencePieceBPETokenizer()
        self.init_properties(languages)
        self.tokenizer.train(files=paths, vocab_size=vocab_size,
                             min_frequency=5,
                             special_tokens=self.special_tokens)
        self.save(directory=to_save_dir)

    def _tokenize(self, line) -> Encoding:
        # Thin wrapper so callers get the full Encoding (ids, tokens, ...).
        return self.tokenizer.encode(line)

    def save(self, directory):
        """Persist tokenizer files and the pickled language mapping."""
        self.tokenizer.save(directory)
        with open(os.path.join(directory, "langs"), "wb") as fp:
            pickle.dump(self.languages, fp)

    def tokenize_one_line(self, line, ignore_middle_eos: bool = False) -> List[int]:
        """Tokenize a line of </s>-separated sentences into one id list,
        optionally suppressing the per-sentence separators."""
        tokenized = []
        spl = [sen for sen in line.split("</s>") if len(sen.strip()) > 0]
        # A leading "<...>" word is treated as a language marker token.
        if spl[0].startswith("<"):
            words = spl[0].strip().split(" ")
            spl[0] = " ".join(words[1:])
            tokenized += [self.token_id(words[0])]
        for sen in spl:
            tokenized += self._tokenize(sen).ids
            if not ignore_middle_eos:
                tokenized += [self.sep_token_id()]
        if ignore_middle_eos:
            # Only one trailing </s> when middle separators are suppressed.
            tokenized += [self.sep_token_id()]
        return tokenized

    def tokenize_one_sentence(self, line) -> List[int]:
        """
        Assume that the sentence has language id in the first token and end
        of sentence as the end!
        :param line:
        :return:
        """
        spl = line.strip().split(" ")
        lang_id, sen, eos = spl[0], " ".join(spl[1:-1]), spl[-1]
        tokenized = [self.token_id(lang_id)
                     ] + self._tokenize(sen).ids + [self.token_id(eos)]
        return tokenized

    def tokenize_lines(self, line, blind_split: bool = False,
                       split_len: int = 512) -> List[List[int]]:
        """
        :param line:
        :param blind_split: If True, just splits the tokenized data into
            chunks without considering that every vector should start with a
            first word in sentence.
        :return:
        """
        tokenized = []
        if len(self.languages) > 0:
            spl = [sen for sen in line.split("</s>") if len(sen.strip()) > 0]
            lang_id = []
            if spl[0].startswith("<"):
                words = spl[0].strip().split(" ")
                lang_id = [self.token_id(words[0])]
                spl[0] = " ".join(words[1:])
            max_len = 0
            for sen in spl:
                toks = self._tokenize(sen).ids
                # Each sentence is prefixed with the language id and
                # terminated with </s>.
                tokenized += lang_id + toks + [self.sep_token_id()]
                max_len = max(max_len, len(toks) + 1)
        else:
            # NOTE(review): on this branch `max_len` is never assigned, so the
            # non-blind_split return below would raise NameError — confirm
            # callers always use blind_split=True when no languages are set.
            tokenized = self._tokenize(line.strip()).ids
        if blind_split:
            # Pad to a multiple of split_len and reshape into fixed chunks.
            num_pads = (split_len - (len(tokenized) % split_len))
            pad_arr = [self.pad_token_id()] * num_pads
            tokenized = np.array(tokenized + pad_arr)
            reshaped = tokenized.reshape((-1, split_len))
            return reshaped
        else:
            return self.split_tokenized(tokenized, min(max_len, self.max_len))

    def tokenize(self, lines) -> List[List[int]]:
        """Batch-tokenize newline-separated text into lists of ids."""
        lines = [
            line.strip() for line in lines.strip().split("\n")
            if len(line.strip()) > 0
        ]
        tokenized = self.tokenizer.encode_batch(lines)
        return [tok.ids for tok in tokenized]

    def pad_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.pad_token)

    def mask_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.mask_token)

    def unk_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.unk_token)

    def bos_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.bos)

    def sep_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.sep_token)

    def token_id(self, token: str) -> int:
        """Id for `token`, or 0 when the token is out of vocabulary."""
        tok_id = self.tokenizer.token_to_id(token)
        if tok_id is None:
            return 0
        return tok_id

    def id2token(self, id: int) -> str:
        return self.tokenizer.id_to_token(id)

    def vocab_size(self) -> int:
        return self.tokenizer.get_vocab_size()

    def is_lang(self, id) -> bool:
        # True when this id decodes to one of the language marker tokens.
        return self.tokenizer.id_to_token(id) in self.languages

    def lang_id(self, tok):
        """Language index for marker `tok`, or 0 when it is not a language."""
        if tok in self.languages:
            return self.languages[tok]
        return 0

    def split_tokenized(self, tokenized: List[int],
                        max_length: int = 512) -> List[List[int]]:
        """
        Based on self.max_len, splits very long sequences to smaller ones.
        Here we assume to not have any overlapping sequences.
        If the first token is a language, we add it to every new sequence.
        :return:
        """
        if len(tokenized) <= max_length:
            # Single chunk: right-pad to max_length and return.
            sequences = [tokenized]
            sequences[-1] = sequences[-1] + (
                max_length - len(sequences[-1])) * [self.pad_token_id()]
            return sequences
        has_lang = self.is_lang(tokenized[0])
        sequence = tokenized[0:] if has_lang else tokenized
        seq_len = len(sequence)
        sep_id = self.sep_token_id()
        # Reserve one slot per chunk for the language marker when present.
        max_len = max_length - 1 if has_lang else max_length
        cur_start = 0
        sequences = []
        built_seq = []
        truncated = False  # Shows if previous sequence is truncated due to its length.
        used_ends = set()
        while cur_start < seq_len:
            if not truncated or not has_lang:
                cur_end = min(seq_len, cur_start + max_len)
            else:
                cur_end = min(seq_len, cur_start + max_len + 1)
            subseq = sequence[cur_start:cur_end]
            built_seq += subseq
            # Prefer to end a chunk on a sentence boundary (</s>) when one
            # exists that we have not already used as an endpoint.
            sep_positions = [
                i for i, id in enumerate(built_seq) if id == sep_id
            ]
            if len(sep_positions) > 0:
                if sep_positions[-1] in used_ends:
                    truncated = True
                else:
                    built_seq = built_seq[:sep_positions[-1] + 1]
                    truncated = False
            else:
                truncated = True
            assert built_seq[-1] == sequence[len(built_seq) - 1]
            if has_lang and len(subseq) < max_len + 1:
                subseq = [tokenized[0]] + subseq
            sequences.append(subseq)
            cur_start = len(built_seq)
            used_ends.add(cur_start - 1)
        if len(sequences[-1]) < max_length:
            sequences[-1] = sequences[-1] + (
                max_length - len(sequences[-1])) * [self.pad_token_id()]
        assert built_seq[-1] == sequence[len(built_seq) - 1]
        return sequences