def setup_task(cls, args, **kwargs): """Setup the task (e.g., load dictionaries). Args: args (argparse.Namespace): parsed command-line arguments """ args.left_pad_source = options.eval_bool(args.left_pad_source) args.left_pad_target = options.eval_bool(args.left_pad_target) # find language pair automatically if args.source_lang is None or args.target_lang is None: args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data) if args.source_lang is None or args.target_lang is None: raise Exception('Could not infer language pair, please provide it explicitly') # load dictionaries src_dict = Dictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang))) tgt_dict = Dictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang))) assert src_dict.pad() == tgt_dict.pad() assert src_dict.eos() == tgt_dict.eos() assert src_dict.unk() == tgt_dict.unk() print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict))) print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict))) return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs): """Setup the task (e.g., load dictionaries). Args: args (argparse.Namespace): parsed command-line arguments """ dictionary = Dictionary.load(os.path.join(args.data, 'dict.txt')) print('| dictionary: {} types'.format(len(dictionary))) return cls(args, dictionary)
def setup_task(cls, args, **kwargs): """Setup the task.""" paths = utils.split_paths(args.data) assert len(paths) > 0 dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) logger.info("dictionary: {} types".format(len(dictionary))) if not hasattr(args, "shuffle_instance"): args.shuffle_instance = False return cls(args, dictionary)
def setup_task(cls, args, **kwargs): """Setup the task (e.g., load dictionaries).""" dict_path = os.path.join(args.data, "dict.txt") if not os.path.isfile(dict_path): raise FileNotFoundError("Dict not found: {}".format(dict_path)) tgt_dict = Dictionary.load(dict_path) print("| dictionary: {} types".format(len(tgt_dict))) return cls(args, tgt_dict)
def initializer(self):
    global bpe
    bpe = get_encoder(
        os.path.join(self.roberta_dir, 'gpt2_bpe', 'encoder.json'),
        os.path.join(self.roberta_dir, 'gpt2_bpe', 'vocab.bpe'),
    )
    global vocab
    vocab = Dictionary.load(
        os.path.join(self.roberta_dir, 'roberta.base', 'dict.txt'))

def load_dictionary(cls, filename, use_ctc_loss):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    if use_ctc_loss:
        return CTCLossDictionary.load(filename)
    return Dictionary.load(filename)

def __init__(self, vncore=True):
    """
    Hacky way to run VnCoreNLP tokenizer with PhoBERT

    :param vncore: Set it to `False` if your sentences are already tokenized by VnCoreNLP
    """
    self.dictionary = Dictionary.load(open(DICT_PATH))
    self.annotator = None
    self.vncore = vncore
    self.bpe = fastBPE(args)

def setup_task(cls, cfg: SpanMaskedLMConfig, **kwargs):
    """Setup the task."""
    paths = utils.split_paths(cfg.data)
    assert len(paths) > 0
    dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
    logger.info("dictionary: {} types".format(len(dictionary)))
    if not hasattr(cfg, "shuffle"):
        cfg.shuffle = False
    return cls(cfg, dictionary)

def load_dictionary(cls, args, filename, source=True):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    dictionary = Dictionary.load(filename)
    dictionary.add_symbol('<mask>')
    return dictionary

def setup_task(cls, cfg: HubertPretrainingConfig, **kwargs) -> "HubertPretrainingTask":
    label_dir = cfg.data if cfg.label_dir is None else cfg.label_dir
    dictionaries = {
        label: Dictionary.load(f"{label_dir}/dict.{label}.txt")
        if os.path.exists(f"{label_dir}/dict.{label}.txt")
        else None
        for label in cfg.labels
    }
    return cls(cfg, dictionaries)

def load_target_dictionary(self):
    if self.cfg.labels:
        dict_path = os.path.join(self.cfg.data, f"dict.{self.cfg.labels}.txt")
        if not os.path.isfile(dict_path):
            dict_path = os.path.join(self.cfg.label_dir, f"dict.{self.cfg.labels}.txt")
        return Dictionary.load(dict_path)
    return None

def load_dictionary(cls, filename):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    dictionary = Dictionary.load(filename)
    dictionary.add_symbol("<mask>")
    return dictionary

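Note that add_symbol appends the new type at the end of the loaded vocabulary and returns its index, so the <mask> id can be looked up again later with index(). A small hedged sketch, with an in-memory dictionary standing in for a loaded dict.txt:

from fairseq.data import Dictionary

d = Dictionary()
d.add_symbol("hello")
mask_idx = d.add_symbol("<mask>")  # appended after all existing symbols

assert mask_idx == d.index("<mask>") == len(d) - 1
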
def test_finalize(self):
    txt = [
        'A B C D',
        'B C D',
        'C D',
        'D',
    ]
    ref_ids1 = list(map(torch.IntTensor, [
        [4, 5, 6, 7, 2],
        [5, 6, 7, 2],
        [6, 7, 2],
        [7, 2],
    ]))
    ref_ids2 = list(map(torch.IntTensor, [
        [7, 6, 5, 4, 2],
        [6, 5, 4, 2],
        [5, 4, 2],
        [4, 2],
    ]))

    # build dictionary
    d = Dictionary()
    for line in txt:
        Tokenizer.tokenize(line, d, add_if_not_exist=True)

    def get_ids(dictionary):
        ids = []
        for line in txt:
            ids.append(Tokenizer.tokenize(line, dictionary, add_if_not_exist=False))
        return ids

    def assertMatch(ids, ref_ids):
        for toks, ref_toks in zip(ids, ref_ids):
            self.assertEqual(toks.size(), ref_toks.size())
            self.assertEqual(0, (toks != ref_toks).sum().item())

    ids = get_ids(d)
    assertMatch(ids, ref_ids1)

    # check finalized dictionary
    d.finalize()
    finalized_ids = get_ids(d)
    assertMatch(finalized_ids, ref_ids2)

    # write to disk and reload
    with tempfile.NamedTemporaryFile(mode='w') as tmp_dict:
        d.save(tmp_dict.name)
        d = Dictionary.load(tmp_dict.name)
        reload_ids = get_ids(d)
        assertMatch(reload_ids, ref_ids2)
        assertMatch(finalized_ids, reload_ids)

def load_dictionary(cls, filename, sde=False):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    if sde:
        return CharNgramDictionary.load(filename)
    else:
        return Dictionary.load(filename)

def tokenizer(sentence):
    attrs = Args()
    moses = MosesTokenizer(attrs)
    bpe = SubwordNMTBPE(attrs)
    dictionary = Dictionary.load('dict.en.txt')
    # tokenize, apply BPE, then map the resulting symbols to dictionary indices
    return dictionary.encode_line(bpe.encode(moses.encode(sentence)), add_if_not_exist=False)

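A hedged usage sketch for the helper above; it assumes 'dict.en.txt' and the tokenizer/BPE files referenced by Args() actually exist on disk, and only illustrates the shape of the result:

ids = tokenizer('Hello world!')  # -> torch.IntTensor of vocabulary ids, with eos appended
print(ids.tolist())
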
def load_dictionary(cls, filename, weight_by_freq=False):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    if weight_by_freq:
        return DictionaryWithInvFreqWeight.load(filename)
    else:
        return Dictionary.load(filename)

def setup_task(cls, args, **kwargs):
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)

    # could remove the following
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data)
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')

    # load dictionaries
    if args.flatenc or args.flatdec:
        flatData = args.flatdata
    if args.flatenc:
        flatFile = os.path.join(flatData, 'dict.{}.txt'.format(args.source_lang))
        print("For flat encoder load dictionary: ", flatFile)
        src_dict = Dictionary.load(flatFile)
    else:
        src_dict = DictionaryWCS.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
    if args.flatdec:
        flatFile = os.path.join(flatData, 'dict.{}.txt'.format(args.target_lang))
        print("For flat decoder load dictionary: ", flatFile)
        tgt_dict = Dictionary.load(flatFile)
    else:
        tgt_dict = DictionaryWCS.load(
            os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))

    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

    return cls(args, src_dict, tgt_dict)

def load_pretrained_model(path, src_dict_path, tgt_dict_path, arg_overrides=None):
    model = utils.load_checkpoint_to_cpu(path)
    args = model['args']
    state_dict = model['model']
    args = utils.override_model_args(args, arg_overrides)
    src_dict = Dictionary.load(src_dict_path)
    tgt_dict = Dictionary.load(tgt_dict_path)
    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()

    task = TranslationTask(args, src_dict, tgt_dict)
    model = task.build_model(args)
    model.upgrade_state_dict(state_dict)
    model.load_state_dict(state_dict, strict=True)
    return model

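A hedged usage sketch for load_pretrained_model; the checkpoint and dictionary paths below are placeholders, the dictionaries must match those used to train the checkpoint, and arg_overrides is merged into the args stored in the checkpoint before the model is rebuilt:

model = load_pretrained_model(
    'checkpoints/checkpoint_best.pt',  # hypothetical paths
    'data-bin/dict.en.txt',
    'data-bin/dict.de.txt',
    arg_overrides={'dropout': 0.0},    # optional: override saved hyper-parameters
)
model.eval()
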
def setup_task(cls, args, **kwargs): """Setup the task. """ paths = args.data.split(':') assert len(paths) > 0 dictionary = Dictionary.load(os.path.join(paths[0], 'dict.txt')) logger.info('dictionary: {} types'.format(len(dictionary))) if not hasattr(args, 'shuffle_instance'): args.shuffle_instance = False return cls(args, dictionary)
def setup_task(cls, args, **kwargs):
    # load data dictionary
    data_dictionary_dict = {}
    for field in configs.fields:
        data_dictionary_dict[field] = Dictionary.load(
            os.path.join(args.data, 'input0', f'{field}', 'dict.txt'))
        if field in configs.maskable_fields:
            # to align with the dictionary used in pretraining
            data_dictionary_dict[field].add_symbol('<mask>')
        logger.info('[input {}] dictionary: {} types'.format(field, len(data_dictionary_dict[field])))
    label_dict = data_dictionary_dict  # dummy set as we don't have discrete label
    return SimilarityTask(args, data_dictionary_dict, label_dict)

def setup_task(cls, args, **kwargs):
    paths = utils.split_paths(args.data)
    assert len(paths) > 0
    dictionary = None
    if args.use_bert_dict:
        dictionary = DictionaryForBert.load(os.path.join(paths[0], "dict.txt"))
    else:
        dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
    logger.info("dictionary: {} types".format(len(dictionary)))
    return cls(args, dictionary)

def setup_task(cls, args, **kwargs):
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)

    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(args.data)
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')

    # load dictionaries
    src_dict = Dictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.source_lang)))
    tgt_dict = Dictionary.load(os.path.join(args.data, 'dict.{}.txt'.format(args.target_lang)))
    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))

    return cls(args, src_dict, tgt_dict)

def setup_task(cls, args, **kwargs): """ https://github.com/pytorch/fairseq/blob/master/fairseq/tasks/masked_lm.py#L78 """ paths = utils.split_paths(args.data) assert len(paths) > 0 data_dict = cls.load_dictionary( os.path.join(paths[0], 'input', 'dict.txt')) logger.info('dictionary: {} types'.format(len(data_dict))) meta_dict = Dictionary.load(os.path.join(paths[0], 'meta', 'dict.txt')) return cls(args, data_dict, meta_dict)
def initializer(self):
    global bpe
    bpe = get_encoder(
        os.path.join(self.roberta_dir, 'encoder.json'),
        os.path.join(self.roberta_dir, 'vocab.bpe'),
    )
    global vocab
    vocab = Dictionary.load(os.path.join(self.roberta_dir, 'dict.txt'))
    global entities
    if self.entity_vocab is not None:
        entities = load_entities(self.entity_vocab)

def setup_task(cls, args, **kwargs): """Setup the task (e.g., load dictionaries). Args: args (argparse.Namespace): parsed command-line arguments """ args.left_pad_source = options.eval_bool(args.left_pad_source) args.left_pad_target = options.eval_bool(args.left_pad_target) # find language pair automatically if args.source_lang is None or args.target_lang is None: args.source_lang, args.target_lang = data_utils.infer_language_pair( args.data[0]) if args.source_lang is None or args.target_lang is None: raise Exception( 'Could not infer language pair, please provide it explicitly') # load dictionaries src_dict = Dictionary.load( os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang))) tgt_dict = Dictionary.load( os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang))) assert src_dict.pad() == tgt_dict.pad() assert src_dict.eos() == tgt_dict.eos() assert src_dict.unk() == tgt_dict.unk() if not hasattr(args, 'device_id') or args.device_id == 0: print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict))) print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict))) if hasattr(args, 'share_all_embeddings') and args.share_all_embeddings: src_dict.update(tgt_dict) tgt_dict = src_dict print("Join dictionary to share embeddings") print('| [{}] dictionary: {} types'.format( args.source_lang, len(src_dict))) print('| [{}] dictionary: {} types'.format( args.target_lang, len(tgt_dict))) return cls(args, src_dict, tgt_dict)
def setup_task(cls, args, **kwargs): """Setup the task (e.g., load dictionaries). Args: args (argparse.Namespace): parsed command-line arguments """ args.source_lang = f"{args.source_lang1}_{args.source_lang2}" args.left_pad_source = options.eval_bool(args.left_pad_source) args.left_pad_target = options.eval_bool(args.left_pad_target) # find language pair automatically if args.source_lang is None or args.target_lang is None: args.source_lang, args.target_lang = data_utils.infer_language_pair( args.data[0]) if args.source_lang is None or args.target_lang is None: raise Exception( 'Could not infer language pair, please provide it explicitly') # load dictionaries src_dict1 = Dictionary.load( os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang1))) src_dict2 = Dictionary.load( os.path.join(args.data[0], 'dict.{}.txt'.format(args.source_lang2))) tgt_dict = Dictionary.load( os.path.join(args.data[0], 'dict.{}.txt'.format(args.target_lang))) assert src_dict1.pad() == src_dict2.pad() assert src_dict1.eos() == src_dict2.eos() assert src_dict1.unk() == src_dict2.unk() assert src_dict1.pad() == tgt_dict.pad() assert src_dict1.eos() == tgt_dict.eos() assert src_dict1.unk() == tgt_dict.unk() print('| [{}] dictionary: {} types'.format(args.source_lang1, len(src_dict1))) print('| [{}] dictionary: {} types'.format(args.source_lang2, len(src_dict2))) print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict))) return cls(args, src_dict1, src_dict2, tgt_dict)
def load_dictionary(cls, args, filename, source=True):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    dictionary = Dictionary.load(filename)
    langs = args.langs.split(",")
    for l in langs:
        dictionary.add_symbol("[{}]".format(l))
    dictionary.add_symbol("<mask>")
    return dictionary

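The language tags added above become ordinary vocabulary entries, so their indices can later be used as sentence prefixes or suffixes in mBART-style setups. A small hedged sketch, assuming args.langs were "en_XX,ro_RO" (a hypothetical value; the real task parses it from the command line):

from argparse import Namespace

from fairseq.data import Dictionary

args = Namespace(langs="en_XX,ro_RO")
d = Dictionary()
for l in args.langs.split(","):
    d.add_symbol("[{}]".format(l))
d.add_symbol("<mask>")

print(d.index("[en_XX]"), d.index("[ro_RO]"), d.index("<mask>"))
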
def setup_task(cls, args, **kwargs): """Setup the task (e.g., load dictionaries).""" task = super(SpeechTranslationCTCTask, cls).setup_task(args) source_dict_path = os.path.join(args.data.split(os.pathsep)[0], "dict.{}.txt".format(args.source_lang)) if not os.path.isfile(source_dict_path): raise FileNotFoundError("Dict not found: {}".format(source_dict_path)) src_dict = Dictionary.load(source_dict_path) if args.criterion == "ctc_multi_loss": src_dict.add_symbol("<ctc_blank>") print("| CTC dictionary: {} types".format(len(src_dict))) task.src_dict = src_dict return task
def __init__(
    self,
    data_dir,
    split,
    sample_rate,
    max_sample_size=None,
    min_sample_size=0,
    shuffle=True,
    pad=False,
    normalize=False,
    num_buckets=0,
    compute_mask_indices=False,
    **mask_compute_kwargs,
):
    super().__init__(
        sample_rate=sample_rate,
        max_sample_size=max_sample_size,
        min_sample_size=min_sample_size,
        shuffle=shuffle,
        pad=pad,
        normalize=normalize,
        compute_mask_indices=compute_mask_indices,
        **mask_compute_kwargs,
    )

    from fairseq.data import data_utils, Dictionary

    self.fnames_dict = Dictionary.load(os.path.join(data_dir, "dict.txt"))

    root_path = os.path.join(data_dir, f"{split}.root")
    if os.path.exists(root_path):
        with open(root_path, "r") as f:
            self.root_dir = next(f).strip()
    else:
        self.root_dir = None

    fnames_path = os.path.join(data_dir, split)
    self.fnames = data_utils.load_indexed_dataset(fnames_path, self.fnames_dict)

    lengths_path = os.path.join(data_dir, f"{split}.lengths")
    with open(lengths_path, "r") as f:
        for line in f:
            sz = int(line.rstrip())
            assert (
                sz >= min_sample_size
            ), f"Min sample size is not supported for binarized dataset, but found a sample with size {sz}"
            self.sizes.append(sz)

    self.sizes = np.array(self.sizes, dtype=np.int64)
    self.set_bucket_info(num_buckets)
    logger.info(f"loaded {len(self.fnames)} samples")

def load_dictionary(cls, args, filename, source=True):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    dictionary = Dictionary.load(filename)
    dictionary.add_symbol("<mask>")
    # cls.Q_token = dictionary.add_symbol("<Q>")
    # cls.A_token = dictionary.add_symbol("<A>")
    return dictionary

def setup_task(cls, args, **kwargs):
    paths = args.data.split(':')
    assert len(paths) > 0
    if 'bert' in args and args.bert:
        print('| bert dictionary')
        dictionary = BertDictionary()
    else:
        dictionary = Dictionary.load(os.path.join(paths[0], 'dict.txt'))
        print('| dictionary: {} types'.format(len(dictionary)))
    if args.freq_weighted_replacement:
        print('| freq weighted mask replacement')
    return cls(args, dictionary)
