def setup_task(cls, args, **kwargs):
    """Build the task: load the source and target dictionaries and instantiate ``cls``.

    Args:
        args: nested configuration indexed by key. This method reads
            ``args['task']['data']``, ``args['dataset']['srcdict']`` and
            ``args['dataset']['tgtdict']``.
        **kwargs: accepted for interface compatibility; not used here.

    Returns:
        The result of ``cls(args, src_dict, tgt_dict)``.
    """
    # The data paths are only checked for being non-empty; the dictionaries
    # themselves come from the explicit 'srcdict' / 'tgtdict' entries below.
    data_paths = utils.split_paths(args['task']['data'])
    assert len(data_paths) > 0

    # Source side: start from a dictionary that carries the extra special
    # symbols, then add the entries from the source dictionary file.
    special_symbols = [
        constants.CLS,
        constants.SEP,
        constants.MASK,
        constants.EOL,
        constants.URL,
    ]
    source_dictionary = Dictionary(extra_special_symbols=special_symbols)
    source_dictionary.add_from_file(args['dataset']['srcdict'])

    # Target side: loaded directly via Dictionary.load.
    target_dictionary = Dictionary.load(args['dataset']['tgtdict'])

    # NOTE: pad/eos/unk agreement between the two dictionaries is not
    # verified here.
    return cls(args, source_dictionary, target_dictionary)
def load_dictionary(cls, filename):
    """Load a dictionary from ``filename``.

    A new ``Dictionary`` carrying the extra special symbols CLS, SEP, MASK,
    EOL and URL is created and then filled from the file.

    Args:
        filename (str): path of the dictionary file. Names ending in
            ``.txt`` are read with ``add_from_file``; every other name is
            read with ``add_from_json_file``.

    Returns:
        Dictionary: the dictionary instance created here, after loading.
    """
    dictionary = Dictionary(extra_special_symbols=[
        constants.CLS,
        constants.SEP,
        constants.MASK,
        constants.EOL,
        constants.URL,
    ])
    if filename.endswith('.txt'):
        dictionary.add_from_file(filename)
    else:
        # Call the loader for its effect and return the instance itself.
        # Previously the loader's return value was returned, which is None
        # if add_from_json_file mutates in place (as add_from_file is used
        # above). NOTE(review): confirm add_from_json_file does not return
        # a different object.
        dictionary.add_from_json_file(filename)
    return dictionary