Example #1
 def build_bpe(self, cfg: Union[DictConfig, Namespace]):
     if ((isinstance(cfg, DictConfig) and cfg._name == "characters_asr")
             or (isinstance(cfg, Namespace)
                 and getattr(cfg, "bpe", None) == "characters_asr")):
         self.bpe = encoders.build_bpe(cfg,
                                       space_symbol=self.space_word,
                                       non_lang_syms=self.non_lang_syms)
     else:
         self.bpe = encoders.build_bpe(cfg)
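Most of the examples on this page pass either an argparse Namespace or a Hydra DictConfig straight through to encoders.build_bpe, as Example #1 does for both styles. A minimal standalone sketch of the call, assuming fairseq's bundled gpt2 BPE (which falls back to its downloadable default encoder.json / vocab.bpe when no paths are supplied):

    # Hedged sketch, not taken from any example on this page.
    from argparse import Namespace
    from fairseq.data import encoders

    bpe = encoders.build_bpe(Namespace(bpe="gpt2"))
    token_str = bpe.encode("Hello world")   # space-separated BPE ids as a string
    print(bpe.decode(token_str))            # round-trips to "Hello world"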
Example #2
 def build_bpe(self, args):
     if args.bpe == "characters_asr":
         self.bpe = encoders.build_bpe(
             args,
             space_symbol=self.space_word,
             ends_with_space=True,
             non_lang_syms=self.non_lang_syms,
         )
     else:
         self.bpe = encoders.build_bpe(args)
Example #3
    def __init__(self, cfg, task, models):
        super().__init__()
        self.cfg = cfg
        self.task = task
        self.models = nn.ModuleList(models)
        self.src_dict = task.source_dictionary
        self.tgt_dict = task.target_dictionary

        # optimize model for generation
        for model in self.models:
            model.prepare_for_inference_(cfg)

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(cfg.generation.replace_unk)

        self.tokenizer = encoders.build_tokenizer(cfg.tokenizer)
        self.bpe = encoders.build_bpe(cfg.bpe)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(), *[model.max_positions() for model in models]
        )

        # this is useful for determining the device
        self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float))
Example #4
    def _get_whole_word_mask(self):
        # create masked input and targets
        if self.args.mask_whole_words:
            bpe = encoders.build_bpe(self.args)
            if bpe is not None:

                def is_beginning_of_word(i):
                    if i < self.source_dictionary.nspecial:
                        # special elements are always considered beginnings
                        return True
                    tok = self.source_dictionary[i]
                    if tok.startswith('madeupword'):
                        return True
                    try:
                        return bpe.is_beginning_of_word(tok)
                    except ValueError:
                        return True

                mask_whole_words = torch.ByteTensor(
                    list(
                        map(is_beginning_of_word,
                            range(len(self.source_dictionary)))))
        else:
            mask_whole_words = None
        return mask_whole_words
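The ByteTensor built above records, for every index in the dictionary, whether that token begins a word. A hedged sketch of how such a mask can be consumed downstream (group_into_words is a hypothetical helper, not part of the example):

    # Group a sample's token ids into whole words: a new word starts
    # wherever mask_whole_words[token_id] is nonzero.
    def group_into_words(token_ids, mask_whole_words):
        words, current = [], []
        for tid in token_ids:
            if mask_whole_words[tid] and current:
                words.append(current)
                current = []
            current.append(tid)
        if current:
            words.append(current)
        return words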
Example #5
 def __init__(self, args, task):
     super().__init__(args, task)
     self.eps = args.label_smoothing
     #to print target and generated output
     self.task = task
     args.bpe = 'gpt2'
     self.bpe = encoders.build_bpe(args)
Example #6
    def __init__(self, args, task, models):
        super().__init__()
        self.args = args
        self.task = task
        self.models = nn.ModuleList(models)
        self.src_dict = task.source_dictionary
        self.tgt_dict = task.target_dictionary

        # optimize model for generation
        for model in self.models:
            model.make_generation_fast_(
                beamable_mm_beam_size=(
                    None if getattr(args, 'no_beamable_mm', False)
                    else getattr(args, 'beam', 5)
                ),
                need_attn=getattr(args, 'print_alignment', False),
            )

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(
            getattr(args, 'replace_unk', None))

        self.tokenizer = encoders.build_tokenizer(args)
        self.bpe = encoders.build_bpe(args)

        # this is useful for determining the device
        self.register_buffer('_float_tensor',
                             torch.tensor([0], dtype=torch.float))
Example #7
    def __init__(self, args, task, models):
        self.args = args
        self.task = task
        self.models = models
        self.src_dict = task.source_dictionary
        self.tgt_dict = task.target_dictionary
        self.use_cuda = torch.cuda.is_available() and not getattr(
            args, 'cpu', False)

        if self.use_cuda:
            if getattr(args, 'fp16', False):
                self.half()
            self.cuda()

        # optimize model for generation
        for model in self.models:
            model.make_generation_fast_(
                beamable_mm_beam_size=(
                    None if getattr(args, 'no_beamable_mm', False)
                    else getattr(args, 'beam', 5)
                ),
                need_attn=getattr(args, 'print_alignment', False),
            )

        self.generator = self.task.build_generator(args)

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(
            getattr(args, 'replace_unk', None))

        self.tokenizer = encoders.build_tokenizer(args)
        self.bpe = encoders.build_bpe(args)
Example #8
 def __init__(self, args, task):
     super().__init__(args, task)
     if self.args.save_predictions is not None:
         self.prediction_h = open(self.args.save_predictions, 'w')
     else:
         self.prediction_h = None
     self.bpe = encoders.build_bpe(args)
     self.tokenizer = encoders.build_tokenizer(args)
Example #9
    def __init__(self, args, task, model):
        super().__init__()
        self.args = args
        self.task = task
        self.model = model

        self.bpe = encoders.build_bpe(args)

        # this is useful for determining the device
        self.register_buffer('_float_tensor', torch.tensor([0], dtype=torch.float))
Example #10
 def __init__(self, args):
     self.args = args
     self._load_args(self.args)
     self.bpe = encoders.build_bpe(self.args)
     self.task = tasks.setup_task(self.args)
     self.tokenizer = SpmTokenizer(self.args.spm)
     self.align_dict = utils.load_align_dict(self.args.replace_unk)
     self.models = self._load_models(self.args)
     self.max_positions = self._load_max_positions(self.models, self.task)
     self.generator = self.task.build_generator(self.models, self.args)
Example #11
    def __init__(self, cfg, task, model):
        super().__init__()
        self.cfg = cfg
        self.task = task
        self.model = model

        self.bpe = encoders.build_bpe(cfg.bpe)

        # this is useful for determining the device
        self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float))
Example #12
 def __init__(self, args, vocab):
     super().__init__(args)
     self.vocab = vocab 
     if getattr(args, 'bert', False):
         self.mask = vocab.mask_index
         self.bpe = BertTokenizer.from_pretrained('bert-base-uncased')
         self.tokenizer = self.bpe
         print('| bert bpe')
     else:
         self.mask = vocab.add_symbol('<mask>')
         self.bpe = encoders.build_bpe(args)
Example #13
    def __init__(self, args, task):
        super().__init__(task=task)
        self.args = args

        self.generator = SimpleSequenceGenerator(beam=args.scst_beam,
                                                 penalty=args.scst_penalty,
                                                 max_pos=args.max_target_positions,
                                                 eos_index=task.target_dictionary.eos_index)

        # Needed for decoding model output to string
        self.conf_tokenizer = encoders.build_tokenizer(args)
        self.conf_decoder = encoders.build_bpe(args)
        self.target_dict = task.target_dictionary

        # Tokenizer needed for computing CIDEr scores
        self.tokenizer = encoders.build_tokenizer(args)
        self.bpe = encoders.build_bpe(args)
 
        self.scorer = bleu.SacrebleuScorer()

        self.pad_idx = task.target_dictionary.pad()
Example #14
File: utils.py  Project: mingxuan/mRASP
def get_hidden_states(task, model, args):
    src_dict = getattr(task, 'source_dictionary', None)
    tgt_dict = task.target_dictionary

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    def decode_fn(x):
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    def toks_2_sent(toks):
        _str = tgt_dict.string(toks, args.remove_bpe)
        _sent = decode_fn(_str)
        return _sent

    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.batch_size,
        max_positions=utils.resolve_max_positions(task.max_positions(),
                                                  model.max_positions()),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
        # data_buffer_size=args.data_buffer_size,
    ).next_epoch_itr(shuffle=False)

    # initialize
    src_sentences = []
    src_hidden_states_list = []
    idx_list = []

    for sample in tqdm(itr):
        sample = utils.move_to_cuda(sample)
        src_avg_states = get_avg(sample["net_input"]["src_tokens"],
                                 sample["net_input"]["src_lengths"], model,
                                 False)
        src_hidden_states_list.extend(src_avg_states)
        idx_list.extend(sample["id"].detach().cpu().numpy())
        for i, sample_id in enumerate(sample['id'].tolist()):
            src_tokens_i = utils.strip_pad(
                sample['net_input']['src_tokens'][i, :], src_dict.pad())
            src_sent_i = toks_2_sent(src_tokens_i)
            src_sentences.append(src_sent_i)
    return src_sentences, src_hidden_states_list, idx_list
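The decode_fn/toks_2_sent pair above is the canonical fairseq post-processing order: dictionary-to-string conversion first, then BPE removal, then detokenization. A distilled sketch of the same chain as one hypothetical helper:

    # Hedged sketch: token ids -> plain sentence, mirroring the example above.
    def ids_to_sentence(toks, tgt_dict, bpe, tokenizer, remove_bpe=None):
        s = tgt_dict.string(toks, remove_bpe)  # strip dictionary-level BPE markers
        if bpe is not None:
            s = bpe.decode(s)                  # undo learned BPE (e.g. gpt2)
        if tokenizer is not None:
            s = tokenizer.decode(s)            # detokenize (e.g. moses)
        return s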
Example #15
 def __init__(self, args, task):
     super().__init__(args, task)
     self.eps = args.label_smoothing
     self.task = task
     self.debugCount = 0
     args.bpe = 'gpt2'
     self.bpe = encoders.build_bpe(args)
     """
     if args.rewarderpath == None:
         args.rewarderpath = "./semsim/trained_models/" + args.restore_file.split('/')[-1] # TODO : refactoring required
         print("args.rewarderpath not set : use %s instead."%args.rewarderpath) """
     args.rewarderpath = "./semsim/trained_models/sample.model"  #TODO
     self.rewarder = Rewarder(args.rewarderpath)
     self.loss_weight = args.loss_weight
Example #16
    def __init__(self, args, task, model):
        super().__init__()
        args.gpt2_encoder_json = PATH_TO_GPT2BPE + "/encoder.json"
        args.gpt2_vocab_bpe = PATH_TO_GPT2BPE + "/vocab.bpe"
        self.args = args
        self.task = task
        self.model = model

        self.bpe = encoders.build_bpe(args)
        self.max_positions = 1024

        # this is useful for determining the device
        self.register_buffer("_float_tensor",
                             torch.tensor([0], dtype=torch.float))
Example #17
File: squad2.py  Project: yf1291/nlp4
    def __init__(self, args, dictionary):
        super().__init__(args)

        self.dictionary = dictionary
        self.seed = args.seed
        self.bpe = encoders.build_bpe(args)
        self.tokenizer = SQuADTokenizer(args.bpe_vocab_file, dictionary)
        self.do_evaluate = args.do_evaluate
        try:
            from transformers.data.processors.squad import SquadV2Processor
            self.processor = SquadV2Processor()
        except ImportError:
            raise ImportError(
                'Please install transformers with: pip install transformers')
Example #18
def main(args):
    utils.import_user_module(args)

    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    logger.info('loading model(s) from {}'.format(args.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(os.pathsep),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Initialize generator
    generator = task.build_generator(args)

    # Handle tokenization and BPE
    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)
    return tokenizer, bpe, args, task, models, use_cuda, generator, tgt_dict, src_dict
Example #19
    def __init__(self, args, vocab):
        super().__init__(args)
        self.vocab = vocab
        self.mask = vocab.add_symbol("<mask>")

        self.bpe = encoders.build_bpe(args)
        self.tokenizer = encoders.build_tokenizer(args)

        # hack to handle GPT-2 BPE, which includes leading spaces
        if args.bpe == "gpt2":
            self.leading_space = True
            self.trailing_space = False
        else:
            self.leading_space = False
            self.trailing_space = True
Example #20
    def __init__(self, args, task, model):
        super().__init__()
        self.args = args
        self.task = task
        self.model = model

        self.bpe = encoders.build_bpe(args)

        self.max_positions = min(utils.resolve_max_positions(
            self.task.max_positions(),
            self.model.max_positions(),
        ))

        # this is useful for determining the device
        self.register_buffer('_float_tensor', torch.tensor([0], dtype=torch.float))
Example #21
 def __init__(self,
              dataset: HiveDataset,
              dictionary,
              split_range: Tuple[float, float] = (0.0, 1.0)):
     super().__init__()
     self.dataset = dataset
     self.dictionary = dictionary
     self.split_range = split_range
     self.bpe = encoders.build_bpe(
         argparse.Namespace(
             bpe='gpt2',
             gpt2_encoder_json='/mnt/vol/gfsai-flash3-east/ai-group/users/myleott/gpt2_bpe/encoder.json',
             gpt2_vocab_bpe='/mnt/vol/gfsai-flash3-east/ai-group/users/myleott/gpt2_bpe/vocab.bpe',
         ))
Example #22
    def __init__(self, model_path):
        parser = argparse.ArgumentParser()
        self.add_args(parser)
        args = parser.parse_args()
        print(args)

        # build bpe
        self.bpe = encoders.build_bpe(args)
        # some parameters
        self.bos = "<s>"
        self.eos = "</s>"
        self.unk = "<unk>"
        self.pad = "<pad>"
        # load dictionary of bpe tokens
        with open(os.path.join(model_path, "dict.txt"), "r",
                  encoding="utf-8") as f:
            self.dictionary = OnmtDictionary.load(f)
Example #23
def load_bart_decoder(path):
    """Loads a source side word map from the file system.
    
    Args:
        path (string): Path to the word map (Format: word id)
    
    Returns:
        dict. Source word map (key: word, value: id)
    """

    from fairseq import options
    from fairseq.data import encoders
    input_args = ['--path', path, os.path.dirname(path), '--bpe', 'gpt2']
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser, input_args)

    global bart
    bart = encoders.build_bpe(args)
Example #24
File: utils.py  Project: verashira/TSPNet
def get_whole_word_mask(args, dictionary):
    bpe = encoders.build_bpe(args)
    if bpe is not None:
        def is_beginning_of_word(i):
            if i < dictionary.nspecial:
                # special elements are always considered beginnings
                return True
            tok = dictionary[i]
            if tok.startswith('madeupword'):
                return True
            try:
                return bpe.is_beginning_of_word(tok)
            except ValueError:
                return True
        mask_whole_words = torch.ByteTensor(list(
            map(is_beginning_of_word, range(len(dictionary)))
        ))
        return mask_whole_words
    return None
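In fairseq, the mask returned by get_whole_word_mask is typically handed to the dataset that applies whole-word masking. A sketch of that call site, hedged because argument names can differ between fairseq releases:

    # Sketch of the usual consumer (fairseq's masked_lm task); the exact
    # signature may vary by version.
    from fairseq.data import MaskTokensDataset

    mask_whole_words = get_whole_word_mask(args, dictionary)
    src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
        dataset,
        dictionary,
        pad_idx=dictionary.pad(),
        mask_idx=mask_idx,
        seed=args.seed,
        mask_prob=args.mask_prob,
        mask_whole_words=mask_whole_words,
    )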
Example #25
    def __init__(self, tokenizer_path):
        super().__init__()
        self.dict = Dictionary.load(os.path.join(tokenizer_path, 'dict.txt'))
        # <sep> and <pad> already exist in the dictionary
        self.index_special_tokens = {
            tok: self.dict.add_symbol(tok)
            for tok in special_tokens
        }

        args = Namespace(bpe='sentencepiece',
                         sample_break_mode='complete',
                         sentencepiece_vocab=os.path.join(
                             tokenizer_path, 'sentencepiece.bpe.model'))
        self.bpe = encoders.build_bpe(args)

        # this is useful for determining the device
        self.register_buffer('_float_tensor',
                             torch.tensor([0], dtype=torch.float))
        self.info = 'fairseq'
Example #26
    def generate_batched_itr(
        self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
        cuda=False, timer=None, prefix_size=0, src_pt=None, trg_pt=None,
        tgt_dict=None,
    ):
        """Iterate over a batched dataset and yield individual translations.

        Args:
            maxlen_a/b (int, optional): generate sequences of maximum length
                ``ax + b``, where ``x`` is the source sentence length.
            cuda (bool, optional): use GPU for generation
            timer (StopwatchMeter, optional): time generations
            prefix_size (int, optional): prefill the generation with the gold
                prefix up to this length.
        """
        if maxlen_b is None:
            maxlen_b = self.maxlen
        
        if self.args.cons_type == 'raw':
            tokenizer = encoders.build_tokenizer(self.args)
            bpe = encoders.build_bpe(self.args)
        else:
            tokenizer, bpe = None, None 
        # import pdb;pdb.set_trace()
        for sample in data_itr:
            s = utils.move_to_cuda(sample) if cuda else sample
            if 'net_input' not in s:
                continue
            input = s['net_input']
            encoder_input = {
                k: v for k, v in input.items()
                if k != 'prev_output_tokens'
            }
            srclen = encoder_input['src_tokens'].size(1)
            with torch.no_grad():
                sample_id = str(sample['id'].cpu().item())
                src_tokens = encoder_input['src_tokens']
                if timer is not None:
                    timer.start()
                hypo = self.npad_decode(encoder_input, maxlen_r=2.0, beam_size=beam_size)
                if timer is not None:
                    timer.stop(hypo['tokens'].size(0)-1)
                yield sample['id'], encoder_input['src_tokens'], sample['target'], hypo['tokens']
Example #27
def cal_bleu(samples, task, args):

    tokenizer = encoders.build_tokenizer(args)
    bpe = encoders.build_bpe(args)

    # NOTE: strips subword-nmt style "@@ " continuation markers directly;
    # the tokenizer/bpe objects built above are not used by this decode_fn.
    def decode_fn(x):
        return (x + ' ').replace('@@ ', '').rstrip()

    tgt_dict = task.target_dictionary

    target_tensor = samples['target']
    cand_tensor = samples['cand']
    batch_bleu = []
    assert len(target_tensor) == len(cand_tensor)

    for i in range(len(target_tensor)):
        tgt_tokens = utils.strip_pad(target_tensor[i],
                                     tgt_dict.pad()).int().cpu()
        cand_tokens = utils.strip_pad(cand_tensor[i],
                                      tgt_dict.pad()).int().cpu()

        tgt_str = tgt_dict.string(tgt_tokens,
                                  None,
                                  escape_unk=True,
                                  extra_symbols_to_ignore={tgt_dict.eos()})
        tgt_str = decode_fn(tgt_str)

        cand_str = tgt_dict.string(cand_tokens,
                                   None,
                                   escape_unk=True,
                                   extra_symbols_to_ignore={tgt_dict.eos()})
        cand_str = decode_fn(cand_str)

        bleuscore = sacrebleu.corpus_bleu([cand_str], [[tgt_str]],
                                          use_effective_order=True)
        batch_bleu.append(bleuscore.score)

    samples['bleu'] = batch_bleu

    assert len(batch_bleu) == len(target_tensor)

    return samples
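The loop above emulates sentence-level BLEU by calling corpus_bleu on singleton lists with use_effective_order=True. Recent sacrebleu versions expose this directly; a hedged one-liner, assuming a version where sentence_bleu applies effective order by default:

    score = sacrebleu.sentence_bleu(cand_str, [tgt_str]).score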
Example #28
def get_word_beginnings(args: argparse.Namespace, dictionary: Dictionary) -> Optional[Dict[int, int]]:
    bpe = encoders.build_bpe(args)
    if bpe is not None:

        def is_beginning_of_word(i):
            if i < dictionary.nspecial:
                return True
            tok = dictionary[i]
            if tok.startswith("madeupword"):
                return True
            try:
                return bpe.is_beginning_of_word(tok)
            except ValueError:
                return True

        is_word_initial = {}
        for i in range(len(dictionary)):
            is_word_initial[i] = int(is_beginning_of_word(i))
        return is_word_initial
    return None
Example #29
    def __init__(
        self,
        args: argparse.Namespace,
        src_dict: Dictionary,
        tgt_dict: Dictionary,
    ):
        super().__init__(args, src_dict, tgt_dict)  # type: ignore
        config = GlossaryTaskConfig.from_args(args)
        if config.enabled:
            logger.info("Glossary is ENABLED")
            logger.info(f"Glossary config: {config}")
        else:
            logger.info("Glossary is DISABLED")
        self.glossary_task_config = config
        # Ensure that <sep> and <c> are defined in the dictionaries.
        ensure_symbols_are_present(
            self.source_dictionary,
            ["<c>", "<sep>"],
            self.glossary_task_config.ok_to_increase_dict_size,
        )
        ensure_symbols_are_present(
            self.target_dictionary,
            ["<c>", "<sep>"],
            self.glossary_task_config.ok_to_increase_dict_size,
        )
        assert (
            self.target_dictionary == self.source_dictionary
        ), "The target dictionary must be the same as the source dictionary, \
    because we use is_word_initial based on a single dictionary and use it for both src and tgt."

        is_word_initial = get_word_beginnings(args, self.source_dictionary)
        if is_word_initial is None:
            raise ValueError("The is_word_initial function is None.")
        self.is_word_initial = is_word_initial

        apply_monkey_patch_for_make_positions(
            positional_marker_symbol_idx=self.source_dictionary.index("<sep>"),
            positional_idx_restart_offset=self.glossary_task_config.constraint_positional_start_idx,
        )
        self.bpe = encoders.build_bpe(args)
Example #30
def extractVocab(model_path=post_rec.RoBertaBase):
    ckpt = torch.load(os.path.join(model_path, "model.pt"), map_location='cpu')
    args = ckpt["args"]
    for file, arg in {
            'code': 'bpe_codes',
            'bpecodes': 'bpe_codes',
            'sentencepiece.bpe.model': 'sentencepiece_vocab',
    }.items():
        path = os.path.join(model_path, file)
        if os.path.exists(path):
            # kwargs[arg] = path
            setattr(args, arg, path)

    setattr(args, "bpe", "gpt2")
    bpe = encoders.build_bpe(args)

    with open(os.path.join(model_path, "dict.txt"), "r") as f:
        dictionary = OnmtDictionary.load(f)

    vocab = []
    for word in dictionary.symbols:
        try:
            if word in dictionary.symbols[:dictionary.nspecial]:
                vocab.append(word)
                print("sp tokens:", word)
                continue
            print(word, "--->", bpe.decode(word), "--->",
                  dictionary.index(word))
            vocab.append(word)
        except Exception:
            print("decoding error, append to vocab directly")
            vocab.append(word)
    print(len(vocab), vocab[:5], vocab[-5:])

    with open(os.path.join(model_path, "vocab.txt"), "w", encoding="utf-8") as f:
        f.writelines(map(lambda w: w + "\n", vocab))
        print("**" * 20)
        print("write vocab.txt")