def setup_task(cls, args, **kwargs):
    """Build the task once the language pair and both dictionaries are ready.

    Args:
        args (argparse.Namespace): parsed command-line arguments. Reads
            ``left_pad_source``, ``left_pad_target``, ``source_lang``,
            ``target_lang`` and ``data``; only ``data[0]`` is used, as the
            directory holding the ``dict.<lang>.txt`` files. The two
            ``left_pad_*`` attributes are overwritten with the result of
            ``options.eval_bool``; the two language attributes are
            overwritten only when at least one of them is ``None``.
        **kwargs: accepted but not used by this method.

    Returns:
        Whatever ``cls(args, src_dict, tgt_dict)`` returns.

    Raises:
        Exception: if either language is still ``None`` after inference.
        AssertionError: if the two dictionaries disagree on ``pad()``,
            ``eos()``, ``unk()`` or ``sep()``.
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)

    # Infer the language pair from the data directory when it is not fully given.
    if args.source_lang is None or args.target_lang is None:
        inferred_pair = data_utils.infer_language_pair(args.data[0])
        args.source_lang, args.target_lang = inferred_pair
    # Checked unconditionally; this is a no-op when both languages were supplied.
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')

    data_dir = args.data[0]

    # Source side: load the dictionary, then register the sentence-end token.
    src_path = os.path.join(data_dir, 'dict.{}.txt'.format(args.source_lang))
    src_dict = BertDictionary.load(src_path)
    src_eos_idx = src_dict.add_special_token('[END_OF_SENT]')
    print('src_dict:[END_OF_SENT] id = {}, token = {}'.format(src_eos_idx, src_dict[src_eos_idx]))

    # Target side: same procedure on the target-language dictionary file.
    tgt_path = os.path.join(data_dir, 'dict.{}.txt'.format(args.target_lang))
    tgt_dict = BertDictionary.load(tgt_path)
    tgt_eos_idx = tgt_dict.add_special_token('[END_OF_SENT]')
    print('tgt_dict:[END_OF_SENT] id = {}, token = {}'.format(tgt_eos_idx, tgt_dict[tgt_eos_idx]))

    # Both dictionaries must agree on these special symbols.
    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    assert src_dict.sep() == tgt_dict.sep()

    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Build the task from a ``vocab.txt`` file and a pretrained BERT tokenizer.

    Args:
        args (argparse.Namespace): parsed command-line arguments. Reads
            ``data`` (joined directly with ``"vocab.txt"``, so it is used
            as a single path here) and ``bert_name`` (handed to
            ``BertTokenizer.from_pretrained``).
        **kwargs: accepted but not used by this method.

    Returns:
        Whatever ``cls(args, tgt_dict)`` returns.

    Raises:
        FileNotFoundError: if ``vocab.txt`` is not a file under ``args.data``.
    """
    vocab_path = os.path.join(args.data, "vocab.txt")
    vocab_present = os.path.isfile(vocab_path)
    if not vocab_present:
        raise FileNotFoundError("Dict not found: {}".format(vocab_path))

    # Imported only after the vocab check, so a missing file is reported first.
    from transformers import BertTokenizer

    bert_tokenizer = BertTokenizer.from_pretrained(args.bert_name)
    vocab = BertDictionary.load(vocab_path, bert_tokenizer)
    print("| dictionary: {} types".format(len(vocab)))
    return cls(args, vocab)
def setup_task(cls, args, **kwargs):
    """Build the task from a single dictionary file.

    Args:
        args (argparse.Namespace): parsed command-line arguments. Reads
            ``left_pad`` (overwritten with the result of
            ``options.eval_bool``) and ``data``; only ``data[0]`` is used,
            as the directory holding ``dict.txt``.
        **kwargs: accepted but not used by this method.

    Returns:
        Whatever ``cls(args, dictionary)`` returns.
    """
    args.left_pad = options.eval_bool(args.left_pad)

    # The dictionary lives in the first entry of ``args.data``.
    dict_file = os.path.join(args.data[0], 'dict.txt')
    vocab = BertDictionary.load(dict_file)
    print('| dictionary: {} types'.format(len(vocab)))
    return cls(args, vocab)