def __init__(self, tokenizer: "PreTrainedTokenizer", args, file_path):
    """Lenient loader for ``src ||| tgt`` parallel text.

    Each well-formed line is pre-tokenized into an example tuple
    ``(ids_src, ids_tgt, bpe2word_map_src, bpe2word_map_tgt)``, where the
    bpe2word maps give, for every sub-token, the index of the source word
    it came from.  Malformed or empty lines are silently skipped.
    When ``args.cache_data`` is set, the example list is cached next to
    the input file with ``torch.save`` and reloaded on later runs unless
    ``args.overwrite_cache`` is set.

    Args:
        tokenizer: tokenizer providing ``tokenize``,
            ``convert_tokens_to_ids`` and ``prepare_for_model``.
        args: namespace with ``cache_data`` and ``overwrite_cache`` flags.
        file_path: path to the ``src ||| tgt`` text file (must exist).
    """
    assert os.path.isfile(file_path)
    logger.info("Creating features from dataset file at %s", file_path)

    cache_fn = f'{file_path}.cache'
    if args.cache_data and os.path.isfile(cache_fn) and not args.overwrite_cache:
        logger.info("Loading cached data from %s", cache_fn)
        self.examples = torch.load(cache_fn)
        return

    self.examples = []
    with open(file_path, encoding="utf-8") as f:
        # Iterate the file object lazily instead of readlines() so the
        # whole corpus is never held in memory as raw strings.
        for line in f:
            if not line or line.isspace():
                continue
            # Split once; the original split twice (check + unpack) and
            # wrapped the unpack in a bare `except:` that could never fire.
            parts = line.split(' ||| ')
            if len(parts) != 2:
                continue
            src, tgt = parts
            if src.rstrip() == '' or tgt.rstrip() == '':
                continue

            sent_src, sent_tgt = src.strip().split(), tgt.strip().split()
            token_src = [tokenizer.tokenize(word) for word in sent_src]
            token_tgt = [tokenizer.tokenize(word) for word in sent_tgt]
            wid_src = [tokenizer.convert_tokens_to_ids(x) for x in token_src]
            wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]
            ids_src = tokenizer.prepare_for_model(
                list(itertools.chain(*wid_src)),
                return_tensors='pt',
                max_length=tokenizer.max_len)['input_ids']
            ids_tgt = tokenizer.prepare_for_model(
                list(itertools.chain(*wid_tgt)),
                return_tensors='pt',
                max_length=tokenizer.max_len)['input_ids']

            # For every sub-token, record the index of the word it belongs to.
            bpe2word_map_src = [i for i, word_list in enumerate(token_src)
                                for _ in word_list]
            bpe2word_map_tgt = [i for i, word_list in enumerate(token_tgt)
                                for _ in word_list]

            self.examples.append(
                (ids_src, ids_tgt, bpe2word_map_src, bpe2word_map_tgt))

    if args.cache_data:
        logger.info("Saving cached data to %s", cache_fn)
        torch.save(self.examples, cache_fn)
def mask_tokens(inputs: torch.Tensor,
                tokenizer: "PreTrainedTokenizer",
                args,
                langid_mask=None,
                lang_id=None) -> Tuple[torch.Tensor, torch.Tensor]:
    """Prepare masked tokens inputs/labels for masked language modeling:
    80% MASK, 10% random, 10% original.

    Args:
        inputs: token-id tensor; masked positions are modified in place.
        tokenizer: must define a mask token; pad and special tokens are
            excluded from masking.
        args: namespace with ``mlm_probability`` (e.g. 0.15).
        langid_mask: optional tensor of language ids per position; positions
            equal to ``lang_id`` are never masked.
        lang_id: language id to protect when ``langid_mask`` is given.

    Returns:
        ``(inputs, labels)`` where ``labels`` is ``-100`` everywhere except
        at masked positions, which keep the original token ids.

    Raises:
        ValueError: if the tokenizer has no mask token.
    """
    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
        )
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training (with
    # probability args.mlm_probability, defaults to 0.15 in Bert/RoBERTa).
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    # NOTE: masks must be torch.bool, not uint8.  On modern PyTorch, `~` on
    # a uint8 tensor is *bitwise* not (1 -> 254), which silently breaks the
    # `~masked_indices` / `~indices_replaced` logic below, and uint8 mask
    # indexing is deprecated.
    probability_matrix.masked_fill_(
        torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    if langid_mask is not None:
        # Never mask positions belonging to the protected language id.
        langid_protect = langid_mask.eq(lang_id)
        probability_matrix.masked_fill_(langid_protect, value=0.0)

    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, replace masked input tokens with tokenizer.mask_token
    # ([MASK]).
    indices_replaced = (torch.bernoulli(torch.full(labels.shape, 0.8)).bool()
                        & masked_indices)
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(
        tokenizer.mask_token)

    # 10% of the time, replace masked input tokens with a random word
    # (0.5 of the remaining 20%).
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                      & masked_indices & ~indices_replaced)
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens
    # unchanged.
    return inputs, labels
def __init__(self, tokenizer: "PreTrainedTokenizer", args, file_path):
    """Strict loader for ``src ||| tgt`` parallel text.

    Same pre-tokenization as the training loader, but any malformed line
    (wrong delimiter count, empty side, or a side that tokenizes to
    nothing) raises ``ValueError`` instead of being skipped.

    Each example is ``(ids_src, ids_tgt, bpe2word_map_src,
    bpe2word_map_tgt)``, where the bpe2word maps give, for every
    sub-token, the index of the word it came from.

    Args:
        tokenizer: tokenizer providing ``tokenize``,
            ``convert_tokens_to_ids`` and ``prepare_for_model``.
        args: unused here; kept for interface parity with the other loader.
        file_path: path to the ``src ||| tgt`` text file (must exist).

    Raises:
        ValueError: on the first line that is not in the correct format.
    """
    assert os.path.isfile(file_path)
    print('Loading the dataset...')
    self.examples = []
    with open(file_path, encoding="utf-8") as f:
        # Iterate lazily instead of readlines(); split once instead of
        # once for the format check and again for the unpack.
        for idx, line in enumerate(f):
            parts = line.split(' ||| ')
            if len(line) == 0 or line.isspace() or len(parts) != 2:
                raise ValueError(
                    f'Line {idx+1} is not in the correct format!')
            src, tgt = parts
            if src.rstrip() == '' or tgt.rstrip() == '':
                raise ValueError(
                    f'Line {idx+1} is not in the correct format!')

            sent_src, sent_tgt = src.strip().split(), tgt.strip().split()
            token_src = [tokenizer.tokenize(word) for word in sent_src]
            token_tgt = [tokenizer.tokenize(word) for word in sent_tgt]
            wid_src = [tokenizer.convert_tokens_to_ids(x) for x in token_src]
            wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]
            ids_src = tokenizer.prepare_for_model(
                list(itertools.chain(*wid_src)),
                return_tensors='pt',
                max_length=tokenizer.max_len)['input_ids']
            ids_tgt = tokenizer.prepare_for_model(
                list(itertools.chain(*wid_tgt)),
                return_tensors='pt',
                max_length=tokenizer.max_len)['input_ids']

            # Length 2 means only the special tokens survived, i.e. the
            # sentence produced no usable sub-tokens.
            if len(ids_src[0]) == 2 or len(ids_tgt[0]) == 2:
                raise ValueError(
                    f'Line {idx+1} is not in the correct format!')

            # For every sub-token, record the index of its source word.
            bpe2word_map_src = [i for i, word_list in enumerate(token_src)
                                for _ in word_list]
            bpe2word_map_tgt = [i for i, word_list in enumerate(token_tgt)
                                for _ in word_list]

            self.examples.append(
                (ids_src[0], ids_tgt[0], bpe2word_map_src, bpe2word_map_tgt))