def build_bpe(self, cfg: Union[DictConfig, Namespace]):
    """Build the BPE encoder for ``cfg`` and store it on ``self.bpe``.

    When the configuration selects ``characters_asr`` the encoder is built
    with this object's ``space_word`` and ``non_lang_syms``; any other
    configuration is handed to ``encoders.build_bpe`` unchanged.

    Args:
        cfg: A ``DictConfig`` (selected through ``cfg._name``) or a
            ``Namespace`` (selected through its optional ``bpe`` attribute).
    """
    if isinstance(cfg, DictConfig) and cfg._name == "characters_asr":
        use_characters_asr = True
    else:
        use_characters_asr = (
            isinstance(cfg, Namespace)
            and getattr(cfg, "bpe", None) == "characters_asr"
        )

    if not use_characters_asr:
        self.bpe = encoders.build_bpe(cfg)
        return

    self.bpe = encoders.build_bpe(
        cfg,
        space_symbol=self.space_word,
        non_lang_syms=self.non_lang_syms,
    )
def build_bpe(self, args):
    """Build the BPE encoder selected by ``args.bpe`` and store it on ``self.bpe``.

    Args:
        args: Parsed arguments; ``args.bpe`` names the encoder. For
            ``"characters_asr"`` the encoder also receives ``space_word`` and
            ``non_lang_syms`` from this object and ``ends_with_space=True``.
    """
    if args.bpe != "characters_asr":
        self.bpe = encoders.build_bpe(args)
        return

    asr_options = dict(
        space_symbol=self.space_word,
        ends_with_space=True,
        non_lang_syms=self.non_lang_syms,
    )
    self.bpe = encoders.build_bpe(args, **asr_options)
def __init__(self, cfg, task, models):
    """Wrap a task and its models for generation.

    Args:
        cfg: Configuration object. ``cfg.generation.replace_unk``,
            ``cfg.tokenizer`` and ``cfg.bpe`` are read here, and ``cfg``
            itself is passed to each model's ``prepare_for_inference_``.
        task: Task providing ``source_dictionary``, ``target_dictionary`` and
            ``max_positions()``.
        models: Iterable of models; stored as an ``nn.ModuleList``.
    """
    super().__init__()
    self.cfg = cfg
    self.task = task
    self.models = nn.ModuleList(models)
    self.src_dict = task.source_dictionary
    self.tgt_dict = task.target_dictionary

    # optimize model for generation
    for prepared in self.models:
        prepared.prepare_for_inference_(cfg)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    replace_unk = cfg.generation.replace_unk
    self.align_dict = utils.load_align_dict(replace_unk)

    self.tokenizer = encoders.build_tokenizer(cfg.tokenizer)
    self.bpe = encoders.build_bpe(cfg.bpe)

    task_limit = self.task.max_positions()
    model_limits = [model.max_positions() for model in models]
    self.max_positions = utils.resolve_max_positions(task_limit, *model_limits)

    # this is useful for determining the device
    device_probe = torch.tensor([0], dtype=torch.float)
    self.register_buffer("_float_tensor", device_probe)
def _get_whole_word_mask(self): # create masked input and targets if self.args.mask_whole_words: bpe = encoders.build_bpe(self.args) if bpe is not None: def is_beginning_of_word(i): if i < self.source_dictionary.nspecial: # special elements are always considered beginnings return True tok = self.source_dictionary[i] if tok.startswith('madeupword'): return True try: return bpe.is_beginning_of_word(tok) except ValueError: return True mask_whole_words = torch.ByteTensor( list( map(is_beginning_of_word, range(len(self.source_dictionary))))) else: mask_whole_words = None return mask_whole_words
def __init__(self, args, task):
    """Initialise the criterion and attach a GPT-2 BPE encoder.

    Args:
        args: Parsed arguments; ``label_smoothing`` is read and ``bpe`` is
            overwritten with ``'gpt2'`` before the encoder is built.
        task: Task object, also kept on ``self.task``.
    """
    super().__init__(args, task)
    # to print target and generated output
    self.task = task
    self.eps = args.label_smoothing

    # NOTE: this overwrites the caller's ``args.bpe`` in place.
    setattr(args, 'bpe', 'gpt2')
    self.bpe = encoders.build_bpe(args)
def __init__(self, args, task, models):
    """Wrap a task and its models for generation.

    Args:
        args: Parsed arguments. The optional attributes ``no_beamable_mm``,
            ``beam``, ``print_alignment`` and ``replace_unk`` are read with
            defaults ``False``, ``5``, ``False`` and ``None``.
        task: Task providing ``source_dictionary`` and ``target_dictionary``.
        models: Iterable of models; stored as an ``nn.ModuleList``.
    """
    super().__init__()
    self.args = args
    self.task = task
    self.models = nn.ModuleList(models)
    self.src_dict = task.source_dictionary
    self.tgt_dict = task.target_dictionary

    # optimize model for generation
    if getattr(args, 'no_beamable_mm', False):
        beamable_mm_beam_size = None
    else:
        beamable_mm_beam_size = getattr(args, 'beam', 5)
    need_attn = getattr(args, 'print_alignment', False)
    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=beamable_mm_beam_size,
            need_attn=need_attn,
        )

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    replace_unk = getattr(args, 'replace_unk', None)
    self.align_dict = utils.load_align_dict(replace_unk)

    self.tokenizer = encoders.build_tokenizer(args)
    self.bpe = encoders.build_bpe(args)

    # this is useful for determining the device
    device_probe = torch.tensor([0], dtype=torch.float)
    self.register_buffer('_float_tensor', device_probe)
def __init__(self, args, task, models):
    """Set up models, generator, tokenizer and BPE for generation.

    Args:
        args: Parsed arguments. The optional attributes ``cpu``, ``fp16``,
            ``no_beamable_mm``, ``beam``, ``print_alignment`` and
            ``replace_unk`` are read with defaults.
        task: Task providing the dictionaries and ``build_generator``.
        models: Iterable of models, stored as given.
    """
    self.args = args
    self.task = task
    self.models = models
    self.src_dict = task.source_dictionary
    self.tgt_dict = task.target_dictionary

    cpu_only = getattr(args, 'cpu', False)
    self.use_cuda = torch.cuda.is_available() and not cpu_only
    if self.use_cuda:
        if getattr(args, 'fp16', False):
            self.half()
        self.cuda()

    # optimize model for generation
    if getattr(args, 'no_beamable_mm', False):
        beamable_mm_beam_size = None
    else:
        beamable_mm_beam_size = getattr(args, 'beam', 5)
    need_attn = getattr(args, 'print_alignment', False)
    for model in self.models:
        model.make_generation_fast_(
            beamable_mm_beam_size=beamable_mm_beam_size,
            need_attn=need_attn,
        )

    self.generator = self.task.build_generator(args)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    replace_unk = getattr(args, 'replace_unk', None)
    self.align_dict = utils.load_align_dict(replace_unk)

    self.tokenizer = encoders.build_tokenizer(args)
    self.bpe = encoders.build_bpe(args)
def __init__(self, args, task):
    """Initialise the criterion, optionally opening a predictions file.

    Args:
        args: Parsed arguments, used to build the BPE encoder and tokenizer.
        task: Task object forwarded to the parent constructor.

    ``self.prediction_h`` is a handle opened for writing at
    ``self.args.save_predictions``, or ``None`` when that option is ``None``.
    The handle is not closed here.
    """
    super().__init__(args, task)

    predictions_path = self.args.save_predictions
    self.prediction_h = (
        None if predictions_path is None else open(predictions_path, 'w')
    )

    self.bpe = encoders.build_bpe(args)
    self.tokenizer = encoders.build_tokenizer(args)
def __init__(self, args, task, model):
    """Store the arguments, task and model and build the BPE encoder.

    Args:
        args: Parsed arguments, passed to ``encoders.build_bpe``.
        task: Task object, stored as ``self.task``.
        model: Model object, stored as ``self.model``.
    """
    super().__init__()
    self.args, self.task, self.model = args, task, model

    self.bpe = encoders.build_bpe(args)

    # this is useful for determining the device
    device_probe = torch.tensor([0], dtype=torch.float)
    self.register_buffer('_float_tensor', device_probe)
def __init__(self, args):
    """Build every component needed for generation from ``args``.

    Args:
        args: Arguments object. It is stored on ``self.args`` and passed
            through ``self._load_args``; every later step reads
            ``self.args`` again, so the statement order matters.
    """
    self.args = args
    # Runs before anything else reads ``self.args``.
    self._load_args(self.args)
    self.bpe = encoders.build_bpe(self.args)
    self.task = tasks.setup_task(self.args)
    self.tokenizer = SpmTokenizer(self.args.spm)
    self.align_dict = utils.load_align_dict(self.args.replace_unk)
    self.models = self._load_models(self.args)
    # Depends on both the loaded models and the task created above.
    self.max_positions = self._load_max_positions(self.models, self.task)
    self.generator = self.task.build_generator(self.models, self.args)
def __init__(self, cfg, task, model):
    """Store the configuration, task and model and build the BPE encoder.

    Args:
        cfg: Configuration object; ``cfg.bpe`` is passed to
            ``encoders.build_bpe``.
        task: Task object, stored as ``self.task``.
        model: Model object, stored as ``self.model``.
    """
    super().__init__()
    self.cfg, self.task, self.model = cfg, task, model

    bpe_cfg = cfg.bpe
    self.bpe = encoders.build_bpe(bpe_cfg)

    # this is useful for determining the device
    device_probe = torch.tensor([0], dtype=torch.float)
    self.register_buffer("_float_tensor", device_probe)
def __init__(self, args, vocab):
    """Initialise with either a BERT tokenizer or the configured BPE.

    Args:
        args: Parsed arguments; the optional ``bert`` flag selects the
            ``bert-base-uncased`` tokenizer.
        vocab: Vocabulary. Its ``mask_index`` is used in BERT mode;
            otherwise ``'<mask>'`` is added to it.
    """
    super().__init__(args)
    self.vocab = vocab

    if not getattr(args, 'bert', False):
        self.mask = vocab.add_symbol('<mask>')
        self.bpe = encoders.build_bpe(args)
        return

    self.mask = vocab.mask_index
    self.bpe = BertTokenizer.from_pretrained('bert-base-uncased')
    # In BERT mode the same object serves as BPE and tokenizer.
    self.tokenizer = self.bpe
    print('| bert bpe')
def __init__(self, args, task):
    """Create the sequence generator, decoding helpers and scorer.

    Args:
        args: Parsed arguments; ``scst_beam``, ``scst_penalty`` and
            ``max_target_positions`` configure the generator.
        task: Task whose ``target_dictionary`` supplies the EOS and pad
            indices.
    """
    super().__init__(task = task)
    self.args = args

    generator_options = dict(
        beam=args.scst_beam,
        penalty=args.scst_penalty,
        max_pos=args.max_target_positions,
        eos_index=task.target_dictionary.eos_index,
    )
    self.generator = SimpleSequenceGenerator(**generator_options)

    # Needed for decoding model output to string
    self.conf_tokenizer = encoders.build_tokenizer(args)
    self.conf_decoder = encoders.build_bpe(args)
    self.target_dict = task.target_dictionary

    # Second tokenizer/BPE pair, built separately from the pair above
    # (original note: tokenizer needed for computing CIDEr scores).
    self.tokenizer = encoders.build_tokenizer(args)
    self.bpe = encoders.build_bpe(args)

    self.scorer = bleu.SacrebleuScorer()
    self.pad_idx = task.target_dictionary.pad()
def get_hidden_states(task, model, args):
    """Collect decoded source sentences, averaged states and sample ids.

    Iterates once, unshuffled, over ``task.dataset(args.gen_subset)``.

    Args:
        task: Task providing dictionaries, datasets and the batch iterator.
        model: Model passed to ``get_avg`` and asked for ``max_positions()``.
        args: Parsed arguments controlling batching and BPE removal.

    Returns:
        tuple: ``(src_sentences, src_hidden_states_list, idx_list)``.
    """
    src_dict = getattr(task, 'source_dictionary', None)
    tgt_dict = task.target_dictionary

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def toks_2_sent(toks):
        # NOTE(review): source tokens are stringified with tgt_dict here;
        # confirm the two dictionaries are shared.
        text = tgt_dict.string(toks, args.remove_bpe)
        if bpe is not None:
            text = bpe.decode(text)
        if tokenizer is not None:
            text = tokenizer.decode(text)
        return text

    dataset = task.dataset(args.gen_subset)
    max_positions = utils.resolve_max_positions(
        task.max_positions(), model.max_positions()
    )
    batch_iterator = task.get_batch_iterator(
        dataset=dataset,
        max_tokens=args.max_tokens,
        max_sentences=args.batch_size,
        max_positions=max_positions,
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    )
    itr = batch_iterator.next_epoch_itr(shuffle=False)

    # initialize
    src_sentences, src_hidden_states_list, idx_list = [], [], []

    for sample in tqdm(itr):
        sample = utils.move_to_cuda(sample)
        net_input = sample["net_input"]

        src_avg_states = get_avg(
            net_input["src_tokens"], net_input["src_lengths"], model, False
        )
        src_hidden_states_list.extend(src_avg_states)
        idx_list.extend(sample["id"].detach().cpu().numpy())

        for row in range(len(sample['id'].tolist())):
            padded_row = sample['net_input']['src_tokens'][row, :]
            stripped = utils.strip_pad(padded_row, src_dict.pad())
            src_sentences.append(toks_2_sent(stripped))

    return src_sentences, src_hidden_states_list, idx_list
def __init__(self, args, task):
    """Initialise the criterion with a GPT-2 BPE encoder and a rewarder.

    Args:
        args: Parsed arguments. ``label_smoothing`` and ``loss_weight`` are
            read; ``bpe`` and ``rewarderpath`` are overwritten in place.
        task: Task object, also kept on ``self.task``.
    """
    super().__init__(args, task)
    self.eps = args.label_smoothing
    self.task = task
    self.debugCount = 0

    # NOTE: overwrites the caller's ``args.bpe`` in place.
    args.bpe = 'gpt2'
    self.bpe = encoders.build_bpe(args)

    # TODO: refactoring required. A disabled fallback used to sit here as a
    # bare string literal: when ``args.rewarderpath`` was None it derived
    # "./semsim/trained_models/" + basename of ``args.restore_file`` and
    # printed the chosen path. The path is currently always overridden.
    args.rewarderpath = "./semsim/trained_models/sample.model"  # TODO
    self.rewarder = Rewarder(args.rewarderpath)
    self.loss_weight = args.loss_weight
def __init__(self, args, task, model):
    """Point ``args`` at the GPT-2 BPE files and build the encoder.

    Args:
        args: Parsed arguments; ``gpt2_encoder_json`` and ``gpt2_vocab_bpe``
            are overwritten with files under ``PATH_TO_GPT2BPE``.
        task: Task object, stored as ``self.task``.
        model: Model object, stored as ``self.model``.
    """
    super().__init__()

    # NOTE: both assignments mutate the caller's ``args`` in place.
    encoder_json = PATH_TO_GPT2BPE + "/encoder.json"
    vocab_bpe = PATH_TO_GPT2BPE + "/vocab.bpe"
    args.gpt2_encoder_json = encoder_json
    args.gpt2_vocab_bpe = vocab_bpe

    self.args, self.task, self.model = args, task, model
    self.bpe = encoders.build_bpe(args)
    self.max_positions = 1024

    # this is useful for determining the device
    device_probe = torch.tensor([0], dtype=torch.float)
    self.register_buffer("_float_tensor", device_probe)
def __init__(self, args, dictionary):
    """Initialise the task with BPE, a SQuAD tokenizer and a processor.

    Args:
        args: Parsed arguments; ``seed``, ``bpe_vocab_file`` and
            ``do_evaluate`` are read.
        dictionary: Dictionary stored on the task and given to the tokenizer.

    Raises:
        ImportError: If ``transformers`` cannot be imported. The original
            import failure is attached as the cause.
    """
    super().__init__(args)
    self.dictionary = dictionary
    self.seed = args.seed
    self.bpe = encoders.build_bpe(args)
    self.tokenizer = SQuADTokenizer(args.bpe_vocab_file, dictionary)
    self.do_evaluate = args.do_evaluate

    # Only the import is guarded, so an ImportError raised while constructing
    # the processor is not mislabelled as a missing package.
    try:
        from transformers.data.processors.squad import SquadV2Processor
    except ImportError as err:
        raise ImportError(
            'Please install transformers with: pip install transformers'
        ) from err
    self.processor = SquadV2Processor()
def main(args):
    """Validate generation arguments and build the objects needed to decode.

    ``args.buffer_size`` and ``args.max_sentences`` are normalised in place.

    Args:
        args: Parsed command-line arguments.

    Returns:
        tuple: ``(tokenizer, bpe, args, task, models, use_cuda, generator,
        tgt_dict, src_dict)``.

    Raises:
        AssertionError: If ``--sampling`` is used with ``nbest != beam`` or
            ``max_sentences`` exceeds ``buffer_size``.
        ValueError, SyntaxError: If ``args.model_overrides`` is not a valid
            Python literal.
    """
    import ast  # local import: only needed for parsing model overrides

    utils.import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    # The overrides come from the command line; literal_eval accepts the
    # expected dict literal without executing arbitrary code as eval() did.
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(os.pathsep),
        arg_overrides=ast.literal_eval(args.model_overrides),
        task=task,
    )

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Initialize generator
    generator = task.build_generator(args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    return tokenizer, bpe, args, task, models, use_cuda, generator, tgt_dict, src_dict
def __init__(self, args, vocab):
    """Register the mask symbol and build the BPE encoder and tokenizer.

    Args:
        args: Parsed arguments; ``args.bpe`` decides the spacing flags.
        vocab: Vocabulary to which ``"<mask>"`` is added.
    """
    super().__init__(args)
    self.vocab = vocab
    self.mask = vocab.add_symbol("<mask>")
    self.bpe = encoders.build_bpe(args)
    self.tokenizer = encoders.build_tokenizer(args)

    # hack to handle GPT-2 BPE, which includes leading spaces
    uses_gpt2_bpe = args.bpe == "gpt2"
    self.leading_space = uses_gpt2_bpe
    self.trailing_space = not uses_gpt2_bpe
def __init__(self, args, task, model):
    """Store arguments, task and model; build BPE and the position limit.

    Args:
        args: Parsed arguments, passed to ``encoders.build_bpe``.
        task: Task object providing ``max_positions()``.
        model: Model object providing ``max_positions()``.
    """
    super().__init__()
    self.args, self.task, self.model = args, task, model

    self.bpe = encoders.build_bpe(args)

    # ``max_positions`` is the minimum over the resolved limits.
    resolved_limits = utils.resolve_max_positions(
        self.task.max_positions(),
        self.model.max_positions(),
    )
    self.max_positions = min(resolved_limits)

    # this is useful for determining the device
    device_probe = torch.tensor([0], dtype=torch.float)
    self.register_buffer('_float_tensor', device_probe)
def __init__(
    self,
    dataset: HiveDataset,
    dictionary,
    split_range: Tuple[float, float] = (0.0, 1.0),
    gpt2_encoder_json=None,
    gpt2_vocab_bpe=None,
):
    """Wrap a ``HiveDataset`` and attach a GPT-2 BPE encoder.

    Args:
        dataset: Underlying dataset.
        dictionary: Dictionary stored on ``self.dictionary``.
        split_range: Pair of floats stored on ``self.split_range``.
        gpt2_encoder_json: Optional path to the GPT-2 ``encoder.json``.
            ``None`` keeps the previous hard-coded location.
        gpt2_vocab_bpe: Optional path to the GPT-2 ``vocab.bpe``. ``None``
            keeps the previous hard-coded location.
    """
    super().__init__()
    self.dataset = dataset
    self.dictionary = dictionary
    self.split_range = split_range

    # The defaults are site-specific absolute paths; the new parameters let
    # callers supply their own copies of the files.
    if gpt2_encoder_json is None:
        gpt2_encoder_json = '/mnt/vol/gfsai-flash3-east/ai-group/users/myleott/gpt2_bpe/encoder.json'
    if gpt2_vocab_bpe is None:
        gpt2_vocab_bpe = '/mnt/vol/gfsai-flash3-east/ai-group/users/myleott/gpt2_bpe/vocab.bpe'

    self.bpe = encoders.build_bpe(
        argparse.Namespace(
            bpe='gpt2',
            gpt2_encoder_json=gpt2_encoder_json,
            gpt2_vocab_bpe=gpt2_vocab_bpe,
        )
    )
def __init__(self, model_path):
    """Parse command-line arguments, build the BPE and load the dictionary.

    Args:
        model_path: Directory containing ``dict.txt``.
    """
    parser = argparse.ArgumentParser()
    self.add_args(parser)
    args = parser.parse_args()
    print(args)

    # build bpe
    self.bpe = encoders.build_bpe(args)

    # some parameters
    self.bos = "<s>"
    # Was "<\s>": an invalid escape sequence and, next to "<s>", "<unk>" and
    # "<pad>", a typo for the end-of-sentence symbol.
    self.eos = "</s>"
    self.unk = "<unk>"
    self.pad = "<pad>"

    # load dictionary of bpe tokens
    with open(os.path.join(model_path, "dict.txt"), "r", encoding="utf-8") as f:
        self.dictionary = OnmtDictionary.load(f)
def load_bart_decoder(path):
    """Build a GPT-2 BPE encoder and publish it as the module global ``bart``.

    The generation parser is fed ``--path <path> <dirname(path)> --bpe gpt2``
    and the resulting arguments are passed to ``encoders.build_bpe``.

    Args:
        path (string): Path handed to the generation parser as ``--path``;
            its directory is passed as the positional argument.

    Returns:
        None. The encoder is stored in the global ``bart``.
    """
    from fairseq import options
    from fairseq.data import encoders

    global bart

    containing_dir = os.path.dirname(path)
    input_args = ['--path', path, containing_dir, '--bpe', 'gpt2']

    generation_parser = options.get_generation_parser()
    parsed = options.parse_args_and_arch(generation_parser, input_args)
    bart = encoders.build_bpe(parsed)
def get_whole_word_mask(args, dictionary):
    """Return a ``torch.ByteTensor`` flagging symbols that begin a word.

    Args:
        args: Parsed arguments, passed to ``encoders.build_bpe``.
        dictionary: Dictionary whose symbols are classified.

    Returns:
        A ``torch.ByteTensor`` with one entry per dictionary symbol, or
        ``None`` when no BPE encoder is built.
    """
    bpe = encoders.build_bpe(args)
    if bpe is None:
        return None

    def starts_word(index):
        if index < dictionary.nspecial:
            # special elements are always considered beginnings
            return True
        token = dictionary[index]
        if token.startswith('madeupword'):
            return True
        try:
            return bpe.is_beginning_of_word(token)
        except ValueError:
            return True

    flags = [starts_word(index) for index in range(len(dictionary))]
    return torch.ByteTensor(flags)
def __init__(self, tokenizer_path):
    """Load the dictionary and sentencepiece BPE found in ``tokenizer_path``.

    Args:
        tokenizer_path: Directory containing ``dict.txt`` and
            ``sentencepiece.bpe.model``.
    """
    super().__init__()

    dict_file = os.path.join(tokenizer_path, 'dict.txt')
    self.dict = Dictionary.load(dict_file)

    # <sep> and <pad> already exist in the dictionary
    token_to_index = {}
    for tok in special_tokens:
        token_to_index[tok] = self.dict.add_symbol(tok)
    self.index_special_tokens = token_to_index

    spm_model = os.path.join(tokenizer_path, 'sentencepiece.bpe.model')
    args = Namespace(
        bpe='sentencepiece',
        sample_break_mode='complete',
        sentencepiece_vocab=spm_model,
    )
    self.bpe = encoders.build_bpe(args)

    # this is useful for determining the device
    device_probe = torch.tensor([0], dtype=torch.float)
    self.register_buffer('_float_tensor', device_probe)
    self.info = 'fairseq'
def generate_batched_itr(
    self,
    data_itr,
    beam_size=None,
    maxlen_a=0.0,
    maxlen_b=None,
    cuda=False,
    timer=None,
    prefix_size=0,
    src_pt=None,
    trg_pt=None,
    tgt_dict=None,
):
    """Iterate over a batched dataset and yield one translation per sample.

    Samples without a ``'net_input'`` entry are skipped.

    Args:
        data_itr: Iterable of sample dicts.
        beam_size (int, optional): forwarded to ``self.npad_decode``.
        maxlen_a/b (int, optional): ``maxlen_b`` falls back to
            ``self.maxlen`` when ``None``; neither is used further here.
        cuda (bool, optional): move each sample to the GPU first.
        timer (StopwatchMeter, optional): time generations.
        prefix_size, src_pt, trg_pt, tgt_dict: accepted but not used here.

    Yields:
        tuple: ``(sample['id'], src_tokens, sample['target'], hypo['tokens'])``.
    """
    if maxlen_b is None:
        maxlen_b = self.maxlen

    # Built only for raw constraints; not used again in this method.
    tokenizer = bpe = None
    if self.args.cons_type == 'raw':
        tokenizer = encoders.build_tokenizer(self.args)
        bpe = encoders.build_bpe(self.args)

    for sample in data_itr:
        s = utils.move_to_cuda(sample) if cuda else sample
        if 'net_input' not in s:
            continue

        net_input = s['net_input']
        encoder_input = {}
        for key, value in net_input.items():
            if key != 'prev_output_tokens':
                encoder_input[key] = value

        srclen = encoder_input['src_tokens'].size(1)
        with torch.no_grad():
            # ``.item()`` requires a single-element id tensor.
            sample_id = str(sample['id'].cpu().item())
            src_tokens = encoder_input['src_tokens']
            if timer is not None:
                timer.start()
            hypo = self.npad_decode(encoder_input, maxlen_r=2.0, beam_size=beam_size)
        if timer is not None:
            timer.stop(hypo['tokens'].size(0) - 1)

        yield sample['id'], encoder_input['src_tokens'], sample['target'], hypo['tokens']
def cal_bleu(samples, task, args):
    """Attach a per-sentence BLEU score list to ``samples`` and return it.

    Args:
        samples: Mapping with ``'target'`` and ``'cand'`` sequences of equal
            length; ``'bleu'`` is written in place.
        task: Task whose ``target_dictionary`` converts tokens to text.
        args: Parsed arguments, passed to the tokenizer/BPE builders.

    Returns:
        The same ``samples`` object, with ``samples['bleu']`` set.
    """
    # Built as in the original; decoding below strips "@@ " markers directly
    # and does not use either object.
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    tgt_dict = task.target_dictionary
    references = samples['target']
    candidates = samples['cand']
    assert len(references) == len(candidates)

    def to_text(token_row):
        ids = utils.strip_pad(token_row, tgt_dict.pad()).int().cpu()
        raw = tgt_dict.string(
            ids,
            None,
            escape_unk=True,
            extra_symbols_to_ignore={tgt_dict.eos()},
        )
        return (raw + ' ').replace('@@ ', '').rstrip()

    batch_bleu = []
    for reference, candidate in zip(references, candidates):
        tgt_str = to_text(reference)
        cand_str = to_text(candidate)
        result = sacrebleu.corpus_bleu(
            [cand_str], [[tgt_str]], use_effective_order=True
        )
        batch_bleu.append(result.score)

    samples['bleu'] = batch_bleu
    assert len(batch_bleu) == len(references)
    return samples
def get_word_beginnings(args: argparse.Namespace, dictionary: Dictionary) -> Optional[Dict[int, int]]:
    """Map every dictionary index to 1 if its symbol begins a word, else 0.

    Args:
        args: Parsed arguments, passed to ``encoders.build_bpe``.
        dictionary: Dictionary whose symbols are classified.

    Returns:
        A dict keyed by symbol index with ``int`` flags, or ``None`` when no
        BPE encoder is built.
    """
    bpe = encoders.build_bpe(args)
    if bpe is None:
        return None

    def starts_word(index):
        # Special symbols and "madeupword" fillers always count as beginnings.
        if index < dictionary.nspecial:
            return True
        token = dictionary[index]
        if token.startswith("madeupword"):
            return True
        try:
            return bpe.is_beginning_of_word(token)
        except ValueError:
            return True

    return {index: int(starts_word(index)) for index in range(len(dictionary))}
def __init__(
    self,
    args: argparse.Namespace,
    src_dict: Dictionary,
    tgt_dict: Dictionary,
):
    """Initialise the task with glossary support.

    Args:
        args: Parsed arguments; also the source of the glossary config.
        src_dict: Source dictionary.
        tgt_dict: Target dictionary; must compare equal to the source one.

    Raises:
        AssertionError: If the target and source dictionaries differ.
        ValueError: If ``get_word_beginnings`` returns ``None``.
    """
    super().__init__(args, src_dict, tgt_dict)  # type: ignore

    config = GlossaryTaskConfig.from_args(args)
    if config.enabled:
        logger.info("Glossary is ENABLED")
        logger.info(f"Glossary config: {config}")
    else:
        logger.info("Glossary is DISABLED")
    self.glossary_task_config = config

    # Ensure that <sep> and <c> are defined in the dictionaries.
    ensure_symbols_are_present(
        self.source_dictionary,
        ["<c>", "<sep>"],
        self.glossary_task_config.ok_to_increase_dict_size,
    )
    ensure_symbols_are_present(
        self.target_dictionary,
        ["<c>", "<sep>"],
        self.glossary_task_config.ok_to_increase_dict_size,
    )

    # The message used a backslash continuation inside the literal, which
    # embedded source indentation in the text; implicit concatenation keeps
    # it clean.
    assert self.target_dictionary == self.source_dictionary, (
        "The target dictionary must be the same as the source dictionary, "
        "because we use is_word_initial based on a single dictionary and use it for both src and tgt."
    )

    is_word_initial = get_word_beginnings(args, self.source_dictionary)
    if is_word_initial is None:
        raise ValueError("The is_word_initial function is None.")
    self.is_word_initial = is_word_initial

    apply_monkey_patch_for_make_positions(
        positional_marker_symbol_idx=self.source_dictionary.index("<sep>"),
        positional_idx_restart_offset=self.glossary_task_config.constraint_positional_start_idx,
    )

    self.bpe = encoders.build_bpe(args)
def extractVocab(model_path=post_rec.RoBertaBase):
    """Write every symbol of the model's ``dict.txt`` to ``vocab.txt``.

    Loads ``model.pt`` from ``model_path`` to recover the training arguments,
    points them at any BPE files present in the directory, forces
    ``bpe="gpt2"``, and prints each non-special symbol with its decoded form
    and index along the way.

    Args:
        model_path: Directory containing ``model.pt`` and ``dict.txt``;
            ``vocab.txt`` is written into the same directory.
    """
    ckpt = torch.load(os.path.join(model_path, "model.pt"), map_location='cpu')
    args = ckpt["args"]

    # Point the arguments at whichever BPE resources exist next to the model.
    for file, arg in {
        'code': 'bpe_codes',
        'bpecodes': 'bpe_codes',
        'sentencepiece.bpe.model': 'sentencepiece_vocab',
    }.items():
        path = os.path.join(model_path, file)
        if os.path.exists(path):
            setattr(args, arg, path)
    setattr(args, "bpe", "gpt2")
    bpe = encoders.build_bpe(args)

    # Read as UTF-8 explicitly, matching how vocab.txt is written below.
    with open(os.path.join(model_path, "dict.txt"), "r", encoding="utf-8") as f:
        dictionary = OnmtDictionary.load(f)

    # Built once instead of re-slicing the symbol list for every word.
    special_symbols = set(dictionary.symbols[:dictionary.nspecial])

    vocab = []
    for word in dictionary.symbols:
        if word in special_symbols:
            print("sp tokens:", word)
        else:
            try:
                print(word, "--->", bpe.decode(word), "--->", dictionary.index(word))
            except Exception:
                # Best effort: the word is kept even if it cannot be decoded.
                # ``Exception`` (not a bare except) lets Ctrl-C through.
                print("decoding error, append to vocab directly")
        vocab.append(word)

    print(len(vocab), vocab[:5], vocab[-5:])
    with open(os.path.join(model_path, "vocab.txt"), "w", encoding="utf-8") as f:
        f.writelines(w + "\n" for w in vocab)
    print("**" * 20)
    print("wirte vocab.txt")