class BPETokenization(Target):

    def __init__(self,
                 tokenizer_dir: str,
                 max_line_length: Union[int, None] = 50,
                 padding_id: int = 0):
        super().__init__()
        assert exists(join(tokenizer_dir, "vocab.json")), \
            f"vocab.json file missing in '{tokenizer_dir}'"
        assert exists(join(tokenizer_dir, "merges.txt")), \
            f"merges.txt file missing in '{tokenizer_dir}'"
        self.tokenizer = ByteLevelBPETokenizer(
            vocab_file=join(tokenizer_dir, "vocab.json"),
            merges_file=join(tokenizer_dir, "merges.txt"))
        self.max_line_length = max_line_length
        self.padding_id = padding_id
        # escape string.printable so regex metacharacters such as ']', '-'
        # and '\' inside it cannot distort the character class
        self.char_re = re.compile(rf"[^{re.escape(string.printable)}]")

    def __call__(self, document):
        assert isinstance(document, dict), \
            f"wrong input of type {type(document)} to tokenizer"
        # strip non-printable characters and drop empty lines
        processed_text = self.char_re.sub("", document["text"])
        lines = [line for line in processed_text.split("\n") if len(line) > 0]
        encoded_lines = []
        for line in self.tokenizer.encode_batch(lines):
            if self.max_line_length is not None:
                encoded_lines.append(line.ids[:self.max_line_length])
            else:
                encoded_lines.append(line.ids)
        # left-pad every line to the length of the longest encoded line
        max_line_length = max(len(line) for line in encoded_lines)
        for j in range(len(encoded_lines)):
            amount_padding = max_line_length - len(encoded_lines[j])
            encoded_lines[j] = [self.padding_id] * amount_padding + encoded_lines[j]
        document["input_ids"] = encoded_lines
        return document
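# A minimal usage sketch for BPETokenization; "bpe_tokenizer_dir" is an
# assumed directory holding a trained vocab.json/merges.txt pair, and the
# sample document is illustrative only.
target = BPETokenization(tokenizer_dir="bpe_tokenizer_dir", max_line_length=50)
document = {"text": "First line of the paper.\nSecond line of the paper.\n"}
document = target(document)
# each entry is one line of token ids, left-padded to a common length
print(document["input_ids"])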
def __init__(self, evaluate: bool = False):
    tokenizer = ByteLevelBPETokenizer(
        "./model/bbpe/vocab.json",
        "./model/bbpe/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    # or use the RobertaTokenizer from `transformers` directly.
    self.examples = []

    # pick the evaluation files when `evaluate` is set, the training files
    # otherwise (the original used "*_eval.csv" in both branches)
    src_files = Path("./data/").glob("*_eval.csv") if evaluate else Path(
        "./data/").glob("*_train.csv")
    for src_file in src_files:
        print("🔥", src_file)
        with open(src_file, 'r', encoding='utf-8') as f:
            for index, line in enumerate(f):
                # encode_batch expects a list of strings, not a single string
                self.examples += [
                    x.ids for x in tokenizer.encode_batch([line])
                ]
                if index % 10000 == 0:
                    print(src_file, index // 10000)
def __init__(self, evaluate: bool = False):
    tokenizer = ByteLevelBPETokenizer(
        "models/faberto/vocab.json",
        "models/faberto/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    # or use the RobertaTokenizer from `transformers` directly.
    self.examples = []

    src_files = Path("./data/").glob("*-eval.txt") if evaluate else Path(
        "./data/").glob("*-train.txt")
    for src_file in src_files:
        print("🔥", src_file)
        lines = src_file.open(encoding="utf-8").read().splitlines()
        self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
def inference():
    from tokenizers import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing

    # initialize tokenizer with saved model files
    tokenizer = ByteLevelBPETokenizer(
        "./tok_checkpoints/tokenizer_model-vocab.json",
        "./tok_checkpoints/tokenizer_model-merges.txt",
    )

    # optional step: preprocess the strings, e.g.
    #   add <s> and </s> as BOS and EOS tokens to the string,
    #   pad the string to some max length and truncate it to some max length
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='<pad>',
                             pad_id=tokenizer.get_vocab()['<pad>'],
                             length=20)
    tokenizer.enable_truncation(max_length=20)

    # tokenize/encode a single string
    input_ids = tokenizer.encode("Hello World, Whats up!!!").ids
    print("input ids", input_ids)
    tokens = tokenizer.encode("Hello World, Whats up!!!").tokens
    print("tokens", tokens)

    # tokenize/encode a batch of strings
    batch_tokenized = tokenizer.encode_batch(
        ["Hello World, Whats up!!!", "Whata whata wa wada wada"])
    input_ids = [i.ids for i in batch_tokenized]
    print("input ids", input_ids)
    tokens = [i.tokens for i in batch_tokenized]
    print("tokens", tokens)
def __init__(self, evaluate: bool = False):
    # vocab_file, merges_file and data_folder are assumed to be defined
    # elsewhere in the module
    tokenizer = ByteLevelBPETokenizer(
        vocab_file,
        merges_file,
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    self.examples = []
    src_files = Path(data_folder).glob("**/*.txt")
    for src_file in src_files:
        print("🇩🇰", src_file)
        lines = src_file.read_text(encoding="utf-8").splitlines()
        self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
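# The three __init__ variants above appear to come from torch Dataset
# subclasses (the usual "train a language model from scratch" pattern);
# a minimal sketch of the full class under that assumption. The class name,
# constructor arguments and glob pattern are illustrative only.
import torch
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from torch.utils.data import Dataset


class LineByLineDataset(Dataset):

    def __init__(self, vocab_file: str, merges_file: str, data_folder: str):
        tokenizer = ByteLevelBPETokenizer(vocab_file, merges_file)
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        self.examples = []
        for src_file in Path(data_folder).glob("**/*.txt"):
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # tensorize lazily, one encoded line per example
        return torch.tensor(self.examples[i])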
class WikiText2DataModule(pl.LightningDataModule):

    def __init__(self,
                 data_dir: str = 'data/wikitext-2',
                 train_batch_size: int = 64,
                 val_batch_size: int = 64,
                 dataloader_num_workers: int = 4,
                 seq_length: int = 64,
                 vocab_size=30000):
        super().__init__()
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.dataloader_num_workers = dataloader_num_workers
        self.seq_length = seq_length
        self.vocab_size = vocab_size
        self.tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)

    def prepare_data(self, *args, **kwargs):
        # note: despite the class name, this loads the wikitext-103 corpus
        dataset = load_dataset("wikitext",
                               "wikitext-103-raw-v1",
                               split="train+test+validation")
        column_names = dataset.column_names

        def batch_iterator(batch_size=1000):
            for i in range(0, len(dataset), batch_size):
                yield dataset[i:i + batch_size]["text"]

        # train the BPE tokenizer once, then reload it from disk
        if (not os.path.exists("data/wiki-vocab.json")) or (
                not os.path.exists("data/wiki-merges.txt")):
            print('TRAIN TOKENIZER')
            self.tokenizer.train_from_iterator(batch_iterator(),
                                               vocab_size=self.vocab_size)
            self.tokenizer.save_model("data/", "wiki")
        else:
            self.tokenizer = ByteLevelBPETokenizer("data/wiki-vocab.json",
                                                   "data/wiki-merges.txt",
                                                   add_prefix_space=True)

        dataset = load_dataset("wikitext", "wikitext-103-raw-v1")

        def tokenize_function(examples):
            return {
                'input_ids':
                list(
                    map(lambda x: x.ids,
                        self.tokenizer.encode_batch(examples['text'])))
            }

        self.tokenized_dataset = dataset.map(tokenize_function,
                                             batched=True,
                                             remove_columns=column_names,
                                             num_proc=4)

    def setup(self, stage: Optional[str] = None):
        # datasets = load_dataset('text',
        #                         data_dir=self.data_dir,
        #                         data_files={'train': 'wiki.train.small.raw',
        #                                     'valid': 'wiki.valid.small.raw'})

        def group_text(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], []) for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; we could pad instead if the model
            # supported it. Customize this part to your needs.
            total_length = (total_length // self.seq_length) * self.seq_length
            # Split into chunks of seq_length.
            result = {
                k: [
                    t[i:i + self.seq_length]
                    for i in range(0, total_length, self.seq_length)
                ] for k, t in concatenated_examples.items()
            }
            result["labels"] = result["input_ids"].copy()
            return result

        lm_dataset = self.tokenized_dataset.map(group_text,
                                                batched=True,
                                                num_proc=4)
        self.train_dataset = lm_dataset['train']
        self.eval_dataset = lm_dataset['validation']
        self.test_dataset = lm_dataset['test']

    def collate_fn(self, features):
        batch = {}
        # 'inputs_ids' in the original was a typo for 'input_ids'
        batch['input_ids'] = torch.tensor([f['input_ids'] for f in features],
                                          dtype=torch.long)
        batch['labels'] = batch['input_ids']
        return batch

    def train_dataloader(self) -> DataLoader:
        return DataLoader(self.train_dataset,
                          batch_size=self.train_batch_size,
                          collate_fn=self.collate_fn,
                          num_workers=self.dataloader_num_workers)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(self.eval_dataset,
                          batch_size=self.val_batch_size,
                          collate_fn=self.collate_fn,
                          num_workers=self.dataloader_num_workers)

    def test_dataloader(self) -> DataLoader:
        return DataLoader(self.test_dataset,
                          batch_size=self.val_batch_size,
                          collate_fn=self.collate_fn,
                          num_workers=self.dataloader_num_workers)
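# A minimal usage sketch for WikiText2DataModule; the LightningModule that
# would consume these batches is defined elsewhere in the project.
dm = WikiText2DataModule(train_batch_size=64, seq_length=64)
dm.prepare_data()   # trains or loads the BPE tokenizer, tokenizes the corpus
dm.setup()          # groups tokens into fixed-length blocks and splits
batch = next(iter(dm.train_dataloader()))
print(batch['input_ids'].shape)  # (train_batch_size, seq_length)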
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])
end = time.time()
print("Training took:", round((end - start) / 60, 2), "minutes.")

# Save files to disk
tokenizer.save_model(chemin_modele, "Token_BERT_MTE")

# %%
print(tokenizer.encode("environnement"), tokenizer.encode("environnement"))

# %%
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

# %%
examples = []
for src_file in Path(chemin_donnees).glob("**/*.txt"):
    print("🔥", src_file)
    lines = src_file.read_text(encoding="utf-8").splitlines()
    examples += [x.ids for x in tokenizer.encode_batch(lines)]

# %%
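# %%
# A sketch of reloading the tokenizer saved above, assuming save_model wrote
# Token_BERT_MTE-vocab.json and Token_BERT_MTE-merges.txt into chemin_modele:
from os.path import join

from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer(
    join(chemin_modele, "Token_BERT_MTE-vocab.json"),
    join(chemin_modele, "Token_BERT_MTE-merges.txt"),
)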
class ReferenceParser(Target):

    def __init__(self, model_dir, device: str = "cpu"):
        super().__init__()
        self.model_dir = abspath(model_dir)
        assert exists(self.model_dir), \
            f"model directory '{self.model_dir}' does not exist"
        assert exists(join(self.model_dir, "config.json")), \
            f"configuration file does not exist in {self.model_dir}"
        assert exists(join(self.model_dir, "merges.txt")), \
            f"merges file does not exist in {self.model_dir}"
        assert exists(join(self.model_dir, "weights.pt")), \
            f"model file does not exist in {self.model_dir}"
        assert exists(join(self.model_dir, "vocab.json")), \
            f"vocab file does not exist in {self.model_dir}"

        with open(join(self.model_dir, "config.json"), "r") as config_file:
            self.model_config = json.load(config_file)

        # fall back to the CPU when no GPU is available
        if not torch.cuda.is_available():
            device = "cpu"
        self.device = torch.device(device)
        self.model = LSTMTagger(
            vocab_size=self.model_config["vocab_size"],
            embedding_dim=self.model_config["embedding_dim"],
            lstm_dim=self.model_config["lstm_dim"],
            device=device).to(self.device)
        weights = torch.load(join(self.model_dir, "weights.pt"),
                             map_location=device)
        self.model.load_state_dict(weights)
        self.model = self.model.eval()

        self.tokenizer = ByteLevelBPETokenizer(
            vocab_file=join(self.model_dir, "vocab.json"),
            merges_file=join(self.model_dir, "merges.txt"),
            lowercase=self.model_config["lowercase"])

        # self.space_re = re.compile(r"\s+")
        self.refer_re = re.compile(
            r"([(\[](\w[^\d()\[\]]+\s[(\[]?[12][0-9]{3}[A-Za-z]?[(\[]?[;,\s]*)+[)\]])"
        )
        self.et_al_re = re.compile(
            r"[^\d\s]+\set\.?\s*al\.?,?\s+([(\[][12][0-9]{3}[)\]])?")
        self.and_re = re.compile(
            r"([^\d\s]+,?\s+((and)|&)\s+)+[^\d\s]+[,\s]*([(\[][12][0-9]{3}[A-Za-z]?[)\]]|[12][0-9]{3}[A-Za-z]?)"
        )
        self.name_re = re.compile(r"[A-Z][^A-Z\d\s()\[\],;\.]+")

    def __call__(self, document):
        assert isinstance(document, dict), \
            f"wrong input of type {type(document)} to reference extractor"
        try:
            self.find_reference_blocks(document)
        except RuntimeError:
            logging.error(
                f"could not parse reference blocks of {document['name']}")
        text_references = self.find_text_references(document)
        # remove references from text, sort to match longest first
        sorted_ref = sorted(text_references,
                            key=lambda ref: len(ref[0]),
                            reverse=True)
        for reference in sorted_ref:
            self.clean_text(document, reference[0])
        return document

    def find_reference_blocks(self, document):
        lines = document["text_cleaned"].split("\n")
        labels = []
        # chunk text and prepare for model
        for i in range(0, len(lines), self.model_config["chunk_size"]):
            chunk = [
                x.ids for x in self.tokenizer.encode_batch(
                    lines[i:i + self.model_config["chunk_size"]])
            ]
            # left-pad every line in the chunk to the longest line
            max_line = max(len(line) for line in chunk)
            for j in range(len(chunk)):
                chunk[j] = [0] * (max_line - len(chunk[j])) + chunk[j]
            chunk = torch.tensor([chunk], dtype=torch.long).to(self.device)
            predictions = self.model.forward(chunk, i, len(lines))
            labels.extend(torch.argmax(predictions[0], -1))

        # collect consecutive lines labeled 1 into reference blocks
        keep_lines = []
        last_label, last_block = 0, []
        for line, label in zip(lines, labels):
            if last_label == 1 and label == 0 and len(last_block) > 0:
                document["entities"][Entity.REFERENCE].add(
                    "\n".join(last_block))
                last_block = []
            if label == 0:
                keep_lines.append(line)
            else:
                last_block.append(line)
            last_label = label
        if len(last_block) > 0:
            document["entities"][Entity.REFERENCE].add("\n".join(last_block))
        document["text_cleaned"] = "\n".join(keep_lines)

    def find_text_references(self, document):
        # find references based on different regular expressions
        references = []
        for x in self.refer_re.finditer(document["text_cleaned"]):
            start, end = x.span()
            references.append(
                (document["text_cleaned"][start:end], start, end))
        for x in self.et_al_re.finditer(document["text_cleaned"]):
            start, end = x.span()
            references.append(
                (document["text_cleaned"][start:end], start, end))
        for x in self.and_re.finditer(document["text_cleaned"]):
            start, end = x.span()
            references.append(
                (document["text_cleaned"][start:end], start, end))

        # remove overlapping references
        for refer_a, refer_b in combinations(references, 2):
            # skip pairs whose members were already merged away; without this
            # guard, remove() can raise ValueError on chained overlaps
            if refer_a not in references or refer_b not in references:
                continue
            if refer_a[1] <= refer_b[1] < refer_a[2] and refer_b[2] <= refer_a[2]:
                references.remove(refer_a)
                references.remove(refer_b)
                start = min(refer_a[1], refer_b[1])
                end = max(refer_a[2], refer_b[2])
                references.append(
                    (document["text_cleaned"][start:end], start, end))

        # extract names of authors in references and add reference entity
        document["references_authors"] = set()
        for reference in references:
            for x in self.name_re.findall(reference[0]):
                document["references_authors"].add(x)
            document["entities"][Entity.REFERENCE].add(reference[0])
        return references
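# A minimal usage sketch for ReferenceParser, assuming a trained model
# directory (the path is illustrative) and the document/entities layout used
# by the surrounding pipeline; clean_text is defined elsewhere in the class.
from collections import defaultdict

parser = ReferenceParser(model_dir="models/reference_parser", device="cuda")
document = {
    "name": "example_paper",
    "text_cleaned": "As shown by Smith et al. (2019), the effect is large.",
    "entities": defaultdict(set),
}
document = parser(document)
print(document["entities"][Entity.REFERENCE])
print(document["references_authors"])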
class PreAbstractParser(Target):

    def __init__(self, model_dir, device="cpu"):
        super().__init__()
        self.model_dir = abspath(model_dir)
        assert exists(self.model_dir), \
            f"model directory '{self.model_dir}' does not exist"
        assert exists(join(self.model_dir, "classes.json")), \
            f"classes file does not exist in {self.model_dir}"
        assert exists(join(self.model_dir, "config.json")), \
            f"configuration file does not exist in {self.model_dir}"
        assert exists(join(self.model_dir, "merges.txt")), \
            f"merges file does not exist in {self.model_dir}"
        assert exists(join(self.model_dir, "weights.pt")), \
            f"weights file does not exist in {self.model_dir}"
        assert exists(join(self.model_dir, "vocab.json")), \
            f"vocab file does not exist in {self.model_dir}"

        with open(join(self.model_dir, "classes.json"), "r") as classes_file:
            self.class_to_index = json.load(classes_file)
            self.index_to_class = {
                v: k for k, v in self.class_to_index.items()
            }
        with open(join(self.model_dir, "config.json"), "r") as config_file:
            self.model_config = json.load(config_file)

        # fall back to the CPU when no GPU is available
        if not torch.cuda.is_available():
            device = "cpu"
        self.device = torch.device(device)
        self.model = LSTMTagger(
            vocab_size=self.model_config["vocab_size"],
            embedding_dim=self.model_config["embedding_dim"],
            lstm_dim=self.model_config["lstm_dim"],
            n_classes=len(self.class_to_index)).to(self.device)
        weights = torch.load(join(self.model_dir, "weights.pt"),
                             map_location=device)
        self.model.load_state_dict(weights)
        self.model = self.model.eval()

        self.tokenizer = ByteLevelBPETokenizer(
            vocab_file=join(self.model_dir, "vocab.json"),
            merges_file=join(self.model_dir, "merges.txt"),
            lowercase=self.model_config["lowercase"])
        self.noise_re = re.compile(r"[^A-Za-z ]")
        self.department_re = re.compile(r"(?:,\s*)?[^,]*Department[^,]*(?:,)",
                                        re.IGNORECASE)

    def __call__(self, document):
        assert isinstance(document, dict), \
            f"wrong input of type {type(document)} to author parser"
        try:
            lines, labels = self.annotate_lines(
                document["text"][:document["abstract_start"]])
        except RuntimeError:
            logging.error(
                f"could not parse pre abstract of {document['name']}")
            return document

        keep_lines = []
        for line, label in zip(lines, labels):
            # keep the title line even when the model labels it otherwise
            if "meta" in document and self.noise_re.sub(
                    "", line) == self.noise_re.sub(
                        "", document["meta"]["title"]):
                keep_lines.append(line)
            elif label == "other":
                keep_lines.append(line)
            else:
                self.create_annotation(document, line, label)
        if "meta" in document:
            keep_lines = self.post_process_lines(document, keep_lines)
        document["text_cleaned"] = "\n".join(
            keep_lines) + document["text"][document["abstract_start"]:]
        return document

    def annotate_lines(self, text):
        lines = text.split("\n")
        tokenized = [x.ids for x in self.tokenizer.encode_batch(lines)]
        # left-pad every line to the longest tokenized line
        max_tokens = max(len(sentence) for sentence in tokenized)
        for sentence in range(len(tokenized)):
            for _ in range(max_tokens - len(tokenized[sentence])):
                tokenized[sentence].insert(0, 0)
        tensor = torch.tensor([tokenized]).to(self.device)
        predictions = self.model.forward(tensor)
        predictions = torch.argmax(predictions[0], -1)
        predictions = [
            self.index_to_class[prediction.item()]
            for prediction in predictions
        ]
        return lines, predictions

    def create_annotation(self, document, line, label):
        if label == "private":
            document["entities"][Entity.PERSONAL_DATA].add(line)
        elif label == "author":
            document["entities"][Entity.AUTHOR].add(line)
        elif label == "email":
            document["entities"][Entity.EMAIL].add(line)
        elif label == "organization":
            for department_mention in self.department_re.findall(line):
                document["entities"][Entity.PERSONAL_DATA].add(
                    department_mention)
            line = self.department_re.sub("", line)
            document["entities"][Entity.INSTITUTION_COMPANY].add(line)
        else:
            logging.error(f"label '{label}' not recognized in {type(self)}")
            raise ValueError(f"label '{label}' not recognized")

    def post_process_lines(self, document, lines):
        keep_lines = []
        for line in lines:
            mention = False
            try:
                for author in document["meta"]["authors"]:
                    if re.search(
                            r"[\s\-]*".join(
                                re.escape(name) for name in author.split()),
                            line, re.IGNORECASE):
                        mention = True
                        document["entities"][Entity.AUTHOR].add(line)
                for organization in document["meta"]["orgs"]:
                    if re.search(
                            r"[\s\-]*".join(
                                re.escape(name)
                                for name in organization["name"].split()),
                            line, re.IGNORECASE):
                        mention = True
                        document["entities"][Entity.INSTITUTION_COMPANY].add(
                            line)
            except KeyError:
                logging.error(
                    f"conferences meta file misses key for {document['name']}")
            if not mention:
                keep_lines.append(line)
        return keep_lines