def setup_task(cls, args, **kwargs):
    """Set up the task: normalize boolean flags, infer the language pair,
    and load the source (BERT/GPT-2) and target (flexible) dictionaries.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    # Normalize string-valued CLI flags into real booleans.
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    args.trigram_block = options.eval_bool(args.trigram_block)
    args.init_from_pretrained_doc_model = options.eval_bool(args.init_from_pretrained_doc_model)
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data)
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')
    # load dictionaries
    # RoBERTa-style checkpoints use a GPT-2 BPE vocabulary; everything else
    # is treated as a BERT WordPiece vocabulary.
    if args.pretrained_bert_model.startswith('roberta'):
        src_dict = GPT2Dictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
    else:
        src_dict = BertDictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
    if args.init_from_pretrained_doc_model:
        # NOTE(review): placeholder branch — the [SENT_MASK] token is
        # presumably added via the BERT special-tokens machinery elsewhere;
        # confirm before relying on this flag.
        print('adding the [SENT_MASK] token? change it within Bert Special Tokens')
        pass  # adding the [SENT_MASK] token?
    tgt_dict = FlexibleDictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task: resolve the padding flags, infer the language pair,
    and load the fixed-name source/target dictionaries.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    data_dirs = utils.split_paths(args.data)
    assert len(data_dirs) > 0
    first_dir = data_dirs[0]
    # Fill in the language pair from the data directory when not given.
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(first_dir)
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')
    # Dictionaries live under fixed names in the first data directory.
    src_dict = cls.load_dictionary(os.path.join(first_dir, 'dict.src.txt'))
    tgt_dict = cls.load_dictionary(os.path.join(first_dir, 'dict.tgt.txt'))
    # Both vocabularies must agree on the special-symbol ids.
    for probe in ('pad', 'eos', 'unk'):
        assert getattr(src_dict, probe)() == getattr(tgt_dict, probe)()
    logger.info('[{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    logger.info('[{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')
    # load dictionaries
    # An '[END_OF_SENT]' marker is appended to each vocabulary; the id it
    # receives depends on the vocabulary size, hence the diagnostic prints.
    src_dict = BertDictionary.load(os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang)))
    src_eos_idx = src_dict.add_special_token('[END_OF_SENT]')
    print('src_dict:[END_OF_SENT] id = {}, token = {}'.format(src_eos_idx, src_dict[src_eos_idx]))
    tgt_dict = BertDictionary.load(os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
    tgt_eos_idx = tgt_dict.add_special_token('[END_OF_SENT]')
    print('tgt_dict:[END_OF_SENT] id = {}, token = {}'.format(tgt_eos_idx, tgt_dict[tgt_eos_idx]))
    # Both vocabularies must agree on all special-symbol ids, including
    # the BERT-specific separator.
    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    assert src_dict.sep() == tgt_dict.sep()
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, cfg: TranslationlfConfig, **kwargs):
    """Set up the task: load dictionaries plus the pre-computed longformer
    representations and sentence-to-document alignment.

    Args:
        cfg (TranslationlfConfig): parsed task configuration
    """
    data_dirs = utils.split_paths(cfg.data)
    assert len(data_dirs) > 0
    first_dir = data_dirs[0]
    # Fill in the language pair from the data directory when omitted.
    if cfg.source_lang is None or cfg.target_lang is None:
        cfg.source_lang, cfg.target_lang = data_utils.infer_language_pair(first_dir)
    if cfg.source_lang is None or cfg.target_lang is None:
        raise Exception(
            "Could not infer language pair, please provide it explicitly")
    src_dict = cls.load_dictionary(
        os.path.join(first_dir, "dict.{}.txt".format(cfg.source_lang)))
    tgt_dict = cls.load_dictionary(
        os.path.join(first_dir, "dict.{}.txt".format(cfg.target_lang)))
    # Special symbols must line up across the two vocabularies.
    for probe in ("pad", "eos", "unk"):
        assert getattr(src_dict, probe)() == getattr(tgt_dict, probe)()
    logger.info("[{}] dictionary: {} types".format(cfg.source_lang, len(src_dict)))
    logger.info("[{}] dictionary: {} types".format(cfg.target_lang, len(tgt_dict)))
    # Load the longformer representations and sentence->document alignment
    # used by the task.
    # TODO: check!  TODO (next): load all of the h5 paths.
    lf_reps = load_longformer_representations(cfg.lf_path)
    sen_doc_align = load_sen_doc_alignment(cfg.sen_doc)
    return cls(cfg, src_dict, tgt_dict, lf_reps, sen_doc_align)
def setup_task(cls, args, **kwargs):
    """Set up the translation task: infer the language pair and load the
    source/target dictionaries.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data)
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')

    def _dict_file(lang):
        # fairseq convention: dictionaries are named dict.<lang>.txt
        return os.path.join(args.data, 'dict.{}.txt'.format(lang))

    src_dict = Dictionary.load(_dict_file(args.source_lang))
    tgt_dict = Dictionary.load(_dict_file(args.target_lang))
    # Both sides must share the same special-symbol ids.
    for probe in ('pad', 'eos', 'unk'):
        assert getattr(src_dict, probe)() == getattr(tgt_dict, probe)()
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task with a single shared dictionary, optionally extended
    with per-language tokens such as "[en]".

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)
    data_dirs = utils.split_paths(args.data)
    assert len(data_dirs) > 0
    # Fill in the language pair from the data directory when not given.
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(data_dirs[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            "Could not infer language pair, please provide it explicitly")
    dictionary = cls.load_dictionary(os.path.join(data_dirs[0], "dict.txt"))
    # langs:"en-zh,my-en"
    logger.info("args.add_lang_token: {} ".format(args.add_lang_token))
    if args.add_lang_token and len(args.langs) > 0:
        for lang_pair in args.langs.split(","):
            if lang_pair == "-":
                continue
            logger.info("{} was add to dictionary".format(lang_pair))
            parts = lang_pair.split("-")
            dictionary.add_symbol("[{}]".format(parts[0]))
            dictionary.add_symbol("[{}]".format(parts[1]))
    # Source and target share the same vocabulary.
    return cls(args, dictionary, dictionary)
def setup_task(cls, args, **kwargs):
    """Set up the task: word dictionaries for both languages plus a shared
    character dictionary.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    # Map the legacy dataset flags onto --dataset-impl.
    if getattr(args, 'raw_text', False):
        utils.deprecation_warning('--raw-text is deprecated, please use --dataset-impl=raw')
        args.dataset_impl = 'raw'
    elif getattr(args, 'lazy_load', False):
        utils.deprecation_warning('--lazy-load is deprecated, please use --dataset-impl=lazy')
        args.dataset_impl = 'lazy'
    data_dirs = args.data.split(':')
    assert len(data_dirs) > 0
    first_dir = data_dirs[0]
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(first_dir)
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')
    src_dict = cls.load_dictionary(os.path.join(first_dir, 'dict.{}.txt'.format(args.source_lang)))
    tgt_dict = cls.load_dictionary(os.path.join(first_dir, 'dict.{}.txt'.format(args.target_lang)))
    char_dict = cls.load_dictionary(os.path.join(first_dir, 'dict_char.txt'))
    # Word dictionaries must agree on the special-symbol ids.
    for probe in ('pad', 'eos', 'unk'):
        assert getattr(src_dict, probe)() == getattr(tgt_dict, probe)()
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict, char_dict)
def setup_task(cls, args, **kwargs):
    """Set up the translation task from a single data directory.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(
            args.data)
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')

    def _load(lang):
        # fairseq convention: dictionaries are named dict.<lang>.txt
        return Dictionary.load(
            os.path.join(args.data, 'dict.{}.txt'.format(lang)))

    src_dict = _load(args.source_lang)
    tgt_dict = _load(args.target_lang)
    for probe in ('pad', 'eos', 'unk'):
        assert getattr(src_dict, probe)() == getattr(tgt_dict, probe)()
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Setup GEC task, including dictionary & model building.

    Similar to the translation task, but also load labels dictionaries.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    # fix: the second description above was a separate string literal — a
    # no-op expression statement, not part of the docstring; merged in.
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')
    # load dictionaries
    src_dict = cls.load_dictionary(os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang)))
    tgt_dict = cls.load_dictionary(os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
    # Both sides must agree on the special-symbol ids.
    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task: BERT-backed source dictionary, file-based target
    dictionary.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    # Fill in the language pair from the data directory when not given.
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(
            args.data[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')
    # Source side comes from the pretrained BERT vocabulary; target side is
    # a regular fairseq dictionary file.
    src_dict = BertBasedDictionary(args.bert_name)
    tgt_dict = cls.load_dictionary(
        os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
    assert src_dict.pad() == tgt_dict.pad(), "%d != %d" % (src_dict.pad(), tgt_dict.pad())
    assert src_dict.eos() == tgt_dict.eos(), "%d != %d" % (src_dict.eos(), tgt_dict.eos())
    assert src_dict.unk() == tgt_dict.unk(), "%d != %d" % (src_dict.unk(), tgt_dict.unk())
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    # left_pad_source=True, left_pad_target=False
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    # e.g. source_lang=cn, target_lang=en; infer from the data directory
    # when not supplied on the command line.
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(
            args.data[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')
    # Load the dictionary files.
    data_dir = args.data[0]
    src_dict = cls.load_dictionary(
        os.path.join(data_dir, 'dict.{}.txt'.format(args.source_lang)))
    tgt_dict = cls.load_dictionary(
        os.path.join(data_dir, 'dict.{}.txt'.format(args.target_lang)))
    for probe in ('pad', 'eos', 'unk'):
        assert getattr(src_dict, probe)() == getattr(tgt_dict, probe)()
    print('| [{}] dictionary: {} 个字符'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} 个字符'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the DPTree task: a tree-aware source dictionary and a plain
    target dictionary.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    # Tree-structured sources must be right-padded.
    assert not args.left_pad_source, f'args.left_pad_source must be False'
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')
    # Default the node-label flag when the argument is absent.
    args.no_strip_node_label = getattr(args, 'no_strip_node_label', False)
    data_dir = args.data[0]
    src_dict = DPTreeWrapperDictionary.load(
        os.path.join(data_dir, 'dict.{}.txt'.format(args.source_lang)),
        no_strip_node_label=args.no_strip_node_label)
    tgt_dict = Dictionary.load(os.path.join(data_dir, 'dict.{}.txt'.format(args.target_lang)))
    for probe in ('pad', 'eos', 'unk'):
        assert getattr(src_dict, probe)() == getattr(tgt_dict, probe)()
    print('| [{}] DPtree-dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task with a single shared subword dictionary.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(
            args.data[0])
    # fix: every sibling task raises here; previously a failed inference
    # left source_lang/target_lang as None and the task limped on.
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')
    # load dictionary
    subword_dict = SubwordDictionary.load(
        os.path.join(args.data[0], 'model.vcb'))
    return cls(args, subword_dict)
def setup_task(cls, args, **kwargs):
    """Set up the extractive-summarization task.

    Builds the source dictionary from a pretrained RoBERTa/BERT vocabulary
    (extended with a '<sent_mask>' special token) and generates a pointer
    style target dictionary sized by --max-doc-length.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    args.trigram_block = options.eval_bool(args.trigram_block)
    args.init_from_pretrained_doc_model = options.eval_bool(
        args.init_from_pretrained_doc_model)
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(
            args.data)
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')
    # load dictionaries: RoBERTa uses a GPT-2 BPE vocab, otherwise WordPiece
    if args.roberta_model.startswith('roberta'):
        src_dict = GPT2Dictionary.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
    else:
        src_dict = BertDictionary.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
    # The id of '<sent_mask>' depends on vocabulary size, hence the prints.
    idx = src_dict.add_special_token('<sent_mask>')
    print('<sent_mask> id = {}, token = {}'.format(idx, src_dict[idx]))
    print('<mask> id is', src_dict.index('<mask>'))
    print('<sent_mask> id is', src_dict.index('<sent_mask>'))
    # tgt_dict = FlexibleDictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
    # generate the tgt_dict: entries are sentence indices 0..max_doc_length-1
    tgt_dict = PointerFlexibleDictionary(args.max_doc_length,
                                         specialTokens=[('EOS', '</s>'),
                                                        ('PAD', '<pad>'),
                                                        ('UNK', '<unk>'),
                                                        ('BOS', '<s>')])
    assert tgt_dict.index('0') == 0
    # fix: the original warning read "idx should should match" (duplicated word)
    print('| WARNING: idx should match the context in the tgt dict')
    # if args.predict_arch == 'pointer_net':
    #     assert tgt_dict.eos() == args.max_doc_length
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, cfg: TranslationConfig, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Languages listed in ``cfg.use_bert_dict`` (comma-separated) get a
    BERT-compatible dictionary; all others use the default loader.

    Args:
        cfg (TranslationConfig): parsed task configuration
    """
    paths = utils.split_paths(cfg.data)
    assert len(paths) > 0
    # find language pair automatically
    if cfg.source_lang is None or cfg.target_lang is None:
        cfg.source_lang, cfg.target_lang = data_utils.infer_language_pair(
            paths[0])
    if cfg.source_lang is None or cfg.target_lang is None:
        raise Exception(
            "Could not infer language pair, please provide it explicitly")
    # load dictionaries
    bert_dict_langs: Set = set(cfg.use_bert_dict.split(","))

    def _load_dict(lang):
        # One loader for both sides: removes the duplicated src/tgt branch
        # and fixes the misspelled log messages ("Dirctionary").
        path = os.path.join(paths[0], "dict.{}.txt".format(lang))
        if lang in bert_dict_langs:
            logger.info("Use DictionaryForBert for {}".format(lang))
            return DictionaryForBert.load(path)
        logger.info("Use default Dictionary for {}".format(lang))
        return cls.load_dictionary(path)

    src_dict = _load_dict(cfg.source_lang)
    tgt_dict = _load_dict(cfg.target_lang)
    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    logger.info("[{}] dictionary: {} types".format(cfg.source_lang, len(src_dict)))
    logger.info("[{}] dictionary: {} types".format(cfg.target_lang, len(tgt_dict)))
    return cls(cfg, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task from components supplied by the caller via kwargs.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')
    # Dictionaries and auxiliary data are pre-built by the caller; nothing
    # is loaded from disk here.
    user_data_frame = kwargs['user_data_frame']
    task_score = kwargs['task_score']
    source_dictionary = kwargs['src_dict']
    target_dictionary = kwargs['tgt_dict']
    return cls(args=args,
               src_dict=source_dictionary,
               tgt_dict=target_dictionary,
               user_data_frame=user_data_frame,
               task_score=task_score)
def setup_task(cls, args, **kwargs):
    """Setup the wait-k simultaneous translation task.

    ``--wait-k`` is either a fixed integer or the name of a sampling
    schedule ('uniform' or 'CL-linear').

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    # args.left_pad_source = options.eval_bool(args.left_pad_source)
    # args.left_pad_target = options.eval_bool(args.left_pad_target)
    if getattr(args, 'raw_text', False):
        utils.deprecation_warning('--raw-text is deprecated, please use --dataset-impl=raw')
        args.dataset_impl = 'raw'
    elif getattr(args, 'lazy_load', False):
        utils.deprecation_warning('--lazy-load is deprecated, please use --dataset-impl=lazy')
        args.dataset_impl = 'lazy'
    # wait-k
    try:
        args.wait_k = int(args.wait_k)
    except (TypeError, ValueError):
        # fix: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; only int() conversion failures belong here.
        if args.wait_k == "uniform":
            assert args.wait_k_sample_start < args.wait_k_sample_end
        elif args.wait_k == "CL-linear":
            assert args.wait_k_sample_start > args.wait_k_sample_end
            assert args.max_epoch <= 0
        else:
            raise ValueError("Unsupported wait-k sampling method %s" % args.wait_k)
    paths = args.data.split(':')
    assert len(paths) > 0
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(paths[0])
    if args.source_lang is None or args.target_lang is None:
        # fix: message previously said "Could not test language pair"
        raise Exception('Could not infer language pair, please provide it explicitly')
    # load dictionaries
    src_dict = cls.load_dictionary(os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
    tgt_dict = cls.load_dictionary(os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))
    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task, choosing between flat (plain) and WCS dictionaries
    independently for the encoder and decoder sides.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    # could remove the following .....
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(
            args.data)
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')
    # load dictionaries
    # Flat-side dictionaries come from args.flatdata; the WCS variants
    # come from args.data.
    if args.flatenc or args.flatdec:
        flatData = args.flatdata
    if args.flatenc:
        flatFile = os.path.join(flatData, 'dict.{}.txt'.format(args.source_lang))
        print("For flat encoder load dictionary: ", flatFile)
        src_dict = Dictionary.load(flatFile)
    else:
        src_dict = DictionaryWCS.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
    if args.flatdec:
        flatFile = os.path.join(flatData, 'dict.{}.txt'.format(args.target_lang))
        print("For flat decoder load dictionary: ", flatFile)
        tgt_dict = Dictionary.load(flatFile)
    else:
        tgt_dict = DictionaryWCS.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
    # Both sides must agree on the special-symbol ids.
    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task: GPT-2 BPE source dictionary (extended with a
    '<sent_mask>' special token) and a flexible target dictionary.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    # Normalize string-valued CLI flags into real booleans.
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    args.trigram_block = options.eval_bool(args.trigram_block)
    args.init_from_pretrained_doc_model = options.eval_bool(
        args.init_from_pretrained_doc_model)
    # Map the legacy dataset flags onto --dataset-impl.
    if getattr(args, 'raw_text', False):
        utils.deprecation_warning(
            '--raw-text is deprecated, please use --dataset-impl=raw')
        args.dataset_impl = 'raw'
    elif getattr(args, 'lazy_load', False):
        utils.deprecation_warning(
            '--lazy-load is deprecated, please use --dataset-impl=lazy')
        args.dataset_impl = 'lazy'
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(
            args.data)
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')
    # load dictionaries
    src_dict = GPT2Dictionary.load(
        os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
    # The id '<sent_mask>' receives depends on vocabulary size, hence the
    # diagnostic prints below.
    idx = src_dict.add_special_token('<sent_mask>')
    print('<sent_mask> id = {}, token = {}'.format(idx, src_dict[idx]))
    print('<mask> id is', src_dict.index('<mask>'))
    print('<sent_mask> id is', src_dict.index('<sent_mask>'))
    tgt_dict = FlexibleDictionary.load(
        os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
    # tgt_dict = None
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task; joins the two dictionaries when embeddings are
    shared across encoder and decoder.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(
            args.data[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')
    data_dir = args.data[0]
    src_dict = Dictionary.load(
        os.path.join(data_dir, 'dict.{}.txt'.format(args.source_lang)))
    tgt_dict = Dictionary.load(
        os.path.join(data_dir, 'dict.{}.txt'.format(args.target_lang)))
    for probe in ('pad', 'eos', 'unk'):
        assert getattr(src_dict, probe)() == getattr(tgt_dict, probe)()
    # Only rank 0 (or single-process runs) reports the vocabulary sizes.
    if not hasattr(args, 'device_id') or args.device_id == 0:
        print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    # Shared embeddings require a single joined vocabulary.
    if hasattr(args, 'share_all_embeddings') and args.share_all_embeddings:
        src_dict.update(tgt_dict)
        tgt_dict = src_dict
        print("Join dictionary to share embeddings")
        print('| [{}] dictionary: {} types'.format(
            args.source_lang, len(src_dict)))
        print('| [{}] dictionary: {} types'.format(
            args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task; requires explicit 'source-target' and
    'source-untarget' entries in --lang-pairs.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    if args.lang_pairs is None:
        raise ValueError(
            '--lang-pairs is required. List all the language pairs in the training objective.'
        )
    if isinstance(args.lang_pairs, str):
        args.lang_pairs = args.lang_pairs.split(',')
    assert 'source-target' in args.lang_pairs and 'source-untarget' in args.lang_pairs
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    data_dirs = utils.split_paths(args.data)
    assert len(data_dirs) > 0
    first_dir = data_dirs[0]
    # Fill in the language pair from the data directory when not given.
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(first_dir)
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')
    src_dict = cls.load_dictionary(
        os.path.join(first_dir, 'dict.{}.txt'.format(args.source_lang)))
    tgt_dict = cls.load_dictionary(
        os.path.join(first_dir, 'dict.{}.txt'.format(args.target_lang)))
    for probe in ('pad', 'eos', 'unk'):
        assert getattr(src_dict, probe)() == getattr(tgt_dict, probe)()
    logger.info('[{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    logger.info('[{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task; the source dictionary class depends on whether the
    input is audio.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    # Default to text input when the flag is absent.
    if not hasattr(args, 'audio_input'):
        args.audio_input = False
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(
            args.data[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')
    # load dictionaries
    src_path = os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang))
    src_dict = AudioDictionary.load(src_path) if args.audio_input else Dictionary.load(src_path)
    tgt_dict = Dictionary.load(
        os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang)))
    for probe in ('pad', 'eos', 'unk'):
        assert getattr(src_dict, probe)() == getattr(tgt_dict, probe)()
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Set up the task: RoBERTa/BERT source dictionary (extended with a
    '<sent_mask>' special token) and a flexible target dictionary.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    # Normalize string-valued CLI flags into real booleans.
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    args.trigram_block = options.eval_bool(args.trigram_block)
    args.init_from_pretrained_doc_model = options.eval_bool(
        args.init_from_pretrained_doc_model)
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(
            args.data)
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')
    # load dictionaries
    # RoBERTa-style checkpoints use a GPT-2 BPE vocabulary; everything else
    # is treated as a BERT WordPiece vocabulary.
    if args.roberta_model.startswith('roberta'):
        src_dict = GPT2Dictionary.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
    else:
        src_dict = BertDictionary.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
    # The id '<sent_mask>' receives depends on vocabulary size, hence the
    # diagnostic prints below.
    idx = src_dict.add_special_token('<sent_mask>')
    print('<sent_mask> id = {}, token = {}'.format(idx, src_dict[idx]))
    print('<mask> id is', src_dict.index('<mask>'))
    print('<sent_mask> id is', src_dict.index('<sent_mask>'))
    tgt_dict = FlexibleDictionary.load(
        os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    # get padding...
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)
    paths = utils.split_paths(args.data)
    assert len(paths) > 0
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(
            paths[0]
        )
    # fix: os.path.join(paths[0], "/Dicts/dict.txt") discarded paths[0]
    # (an absolute second component resets the join), so the debug print
    # did not show the path that was actually loaded. Build the path once
    # and use it for both.
    dict_path = os.path.join(paths[0], "Dicts", "dict.txt")
    print("path:", dict_path)
    dictionary = cls.load_dictionary(dict_path)
    return cls(args, dictionary, paths)
def setup_task(cls, args, **kwargs):
    """Set up the task: parse the masking probabilities and load BERT
    wordpiece dictionaries for both sides.

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    # word_mask_keep_rand holds the mask/keep/random probabilities as a
    # comma-separated string.
    probs = [float(x) for x in args.word_mask_keep_rand.split(',')]
    setattr(args, 'pred_probs', torch.FloatTensor([probs[0], probs[1], probs[2]]))
    # Map the legacy dataset flags onto --dataset-impl.
    if getattr(args, 'raw_text', False):
        utils.deprecation_warning('--raw-text is deprecated, please use --dataset-impl=raw')
        args.dataset_impl = 'raw'
    elif getattr(args, 'lazy_load', False):
        utils.deprecation_warning('--lazy-load is deprecated, please use --dataset-impl=lazy')
        args.dataset_impl = 'lazy'
    data_dirs = args.data.split(':')
    assert len(data_dirs) > 0
    first_dir = data_dirs[0]
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(first_dir)
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')
    # load dictionaries
    src_dict = BertWordpieceDictionary.load(os.path.join(first_dir, 'dict.{}.txt'.format(args.source_lang)))
    tgt_dict = BertWordpieceDictionary.load(os.path.join(first_dir, 'dict.{}.txt'.format(args.target_lang)))
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments

    Raises:
        Exception: if the language pair cannot be inferred from the data dir.
    """
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)

    paths = utils.split_paths(args.data)
    assert len(paths) > 0
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(
            paths[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')

    # load dictionaries
    # With a BERT encoder the target dictionary is loaded first so the
    # source dictionary can mirror its special-symbol ids.
    # (was a redundant if/else assigning True/False)
    tgt_first = args.use_bert_model
    if tgt_first:
        tgt_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)),
            custom_bos=args.bos, custom_pad=args.pad,
            custom_eos=args.eos, custom_unk=args.unk,
            add_sentence_limit_words_after=True)
        src_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)),
            custom_bos=args.bos, custom_pad=args.pad,
            custom_eos=args.eos, custom_unk=args.unk,
            add_sentence_limit_words_after=True,
            tgt_first=tgt_first,
            # reuse the target dictionary's special-symbol ids directly
            # instead of copying them through intermediate locals
            bos_id_tgt=tgt_dict.bos(), pad_id_tgt=tgt_dict.pad(),
            eos_id_tgt=tgt_dict.eos(), unk_id_tgt=tgt_dict.unk())
    else:
        src_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))

    # Both dictionaries must agree on the special-symbol ids.
    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    logger.info('[{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    logger.info('[{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs):
    """Setup the task: load dictionaries and the pretrained BART model.

    Args:
        args (argparse.Namespace): parsed command-line arguments

    Raises:
        Exception: if the language pair cannot be inferred from the data dir.
        ValueError: if ``args.arch`` matches no known BART variant.
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)

    # Legacy flags mapped onto the unified --dataset-impl option.
    if getattr(args, 'raw_text', False):
        utils.deprecation_warning('--raw-text is deprecated, please use --dataset-impl=raw')
        args.dataset_impl = 'raw'
    elif getattr(args, 'lazy_load', False):
        utils.deprecation_warning('--lazy-load is deprecated, please use --dataset-impl=lazy')
        args.dataset_impl = 'lazy'

    paths = args.data.split(':')
    assert len(paths) > 0
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(paths[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')

    # load dictionaries
    assert args.target_lang == 'actions', 'target extension must be "actions"'
    args.target_lang_nopos = 'actions_nopos'    # only build dictionary without pointer values
    args.target_lang_pos = 'actions_pos'
    args.target_lang_vocab_nodes = 'actions.vocab.nodes'
    args.target_lang_vocab_others = 'actions.vocab.others'
    src_dict = cls.load_dictionary(os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
    # NOTE rebuild the target dictionary every time instead of loading
    # dict.actions_nopos.txt from disk
    tgt_dict = cls.build_dictionary_bart_extend(
        node_freq_min=args.node_freq_min,
        node_file_path=os.path.join(paths[0], args.target_lang_vocab_nodes),
        others_file_path=os.path.join(paths[0], args.target_lang_vocab_others)
    )
    # TODO target dictionary 'actions_nopos' is hard coded now; change it later

    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang_nopos, len(tgt_dict)))

    # ========== load the pretrained BART model ==========
    if getattr(args, 'arch', None):
        # training time: pretrained BART needs to be used for initialization
        if 'bart_base' in args.arch or 'bartsv_base' in args.arch:
            print('-' * 10 + ' loading pretrained bart.base model ' + '-' * 10)
            bart = torch.hub.load('pytorch/fairseq', 'bart.base')
        elif 'bart_large' in args.arch or 'bartsv_large' in args.arch:
            print('-' * 10 + 'loading pretrained bart.large model ' + '-' * 10)
            bart = torch.hub.load('pytorch/fairseq', 'bart.large')
        else:
            # BUGFIX: was a bare `raise ValueError` with no message; name the
            # offending architecture so the failure is actionable.
            raise ValueError(
                'architecture {!r} does not match any known BART variant '
                '(expected bart_base/bartsv_base/bart_large/bartsv_large)'.format(args.arch))
    else:
        # inference time: pretrained BART is only used for dictionary related things; size does not matter
        # NOTE size does matter; update this later in model initialization if model is with "bart.large"
        print('-' * 10 + ' (for bpe vocab and embed size at inference time) loading pretrained bart.base model ' + '-' * 10)
        bart = torch.hub.load('pytorch/fairseq', 'bart.base')
    bart.eval()    # the pretrained BART model is only for assistance
    # ====================================================

    return cls(args, src_dict, tgt_dict, bart)