@classmethod
def load_dictionary(cls, filename):
    """Load the dictionary from the filename

    Args:
        filename (str): the filename
    """
    return Dictionary.load(filename)
@classmethod
def build_dictionary(
    cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
):
    """Build the dictionary

    Args:
        filenames (list): list of filenames
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final
            dictionary, including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware
            (e.g., Nvidia Tensor Cores).
    """
    d = Dictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(
            filename, d, tokenizer.tokenize_line, workers
        )
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
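# Usage sketch for the two classmethods above. The corpus file names and the
# `task_cls` handle are illustrative assumptions, not part of the original
# code; any task class exposing these classmethods behaves the same way.

def example_build_and_load_dictionary(task_cls):
    d = task_cls.build_dictionary(
        ["corpus.train.txt", "corpus.valid.txt"],  # hypothetical text files
        workers=4,
        threshold=5,        # drop words seen fewer than 5 times
        padding_factor=8,   # round the vocab size up to a multiple of 8
    )
    d.save("dict.txt")  # Dictionary.save writes one "word count" line per type
    return task_cls.load_dictionary("dict.txt")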
@classmethod
def setup_dictionary(cls, args, **kwargs):
    """Load the input dictionary and derive the (optionally truncated)
    output dictionary."""
    dictionary = None
    output_dictionary = None
    if args.data:
        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
        logger.info("dictionary: {} types".format(len(dictionary)))
        output_dictionary = dictionary
        if args.output_dictionary_size >= 0:
            output_dictionary = TruncatedDictionary(
                dictionary, args.output_dictionary_size
            )
    return (dictionary, output_dictionary)
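# Sketch of how setup_dictionary is driven. The Namespace fields mirror the
# attributes read above (`data`, `output_dictionary_size`); the path is a
# hypothetical data-bin directory containing dict.txt.

from argparse import Namespace

def example_setup_dictionary(task_cls):
    args = Namespace(data="data-bin/wikitext-103", output_dictionary_size=-1)
    dictionary, output_dictionary = task_cls.setup_dictionary(args)
    # With output_dictionary_size >= 0, the output side is truncated to the
    # most frequent words while the input side keeps the full vocabulary.
    return dictionary, output_dictionary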
@classmethod
def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries)."""
    dict_path = os.path.join(args.data, "dict.txt")
    if not os.path.isfile(dict_path):
        raise FileNotFoundError("Dict not found: {}".format(dict_path))
    tgt_dict = Dictionary.load(dict_path)

    if args.criterion == "ctc_loss":
        tgt_dict.add_symbol("<ctc_blank>")
    elif args.criterion == "asg_loss":
        for i in range(1, args.max_replabel + 1):
            tgt_dict.add_symbol(replabel_symbol(i))

    print("| dictionary: {} types".format(len(tgt_dict)))
    return cls(args, tgt_dict)
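# Sketch: the symbols added to the target dictionary depend on the criterion.
# With ctc_loss a single blank token is appended; with asg_loss one replabel
# per repetition length up to max_replabel. The data path is hypothetical.

from argparse import Namespace

def example_setup_task(task_cls):
    args = Namespace(
        data="data/speech",     # must contain dict.txt
        criterion="ctc_loss",
        max_replabel=2,         # only consulted for asg_loss
    )
    return task_cls.setup_task(args)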
def load_dataset(self, split, **kwargs):
    """Load a given dataset split.

    Args:
        split (str): name of the split (e.g., train, valid, test)
    """
    manifest = os.path.join(self.args.data, "{}.tsv".format(split))
    self.datasets[split] = FileAudioDataset(
        manifest,
        sample_rate=self.args.sample_rate,
        max_sample_size=self.args.max_sample_size,
        min_sample_size=self.args.max_sample_size,  # crops are drawn at a fixed size
        min_length=self.args.min_sample_size,       # shorter examples are filtered out
        pad=self.args.labels is not None or self.args.enable_padding,
        normalize=self.args.normalize,
    )

    if self.args.labels:  # only attach targets when a label set is configured
        dict_path = os.path.join(self.args.data, f"dict.{self.args.labels}.txt")
        self._target_dictionary = Dictionary.load(dict_path)

        label_path = os.path.join(self.args.data, f"{split}.{self.args.labels}")
        labels = []
        with open(label_path, "r") as f:
            for line in f:
                labels.append(line)

        process_label = LabelEncoder(self.target_dictionary)

        self.datasets[split] = AddTargetDataset(
            self.datasets[split],
            labels,
            bos=self.target_dictionary.bos(),
            pad=self.target_dictionary.pad(),
            eos=None,
            batch_targets=True,
            process_label=process_label,
            add_to_input=False,
        )
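# Sketch of the on-disk layout load_dataset expects under args.data (file
# names follow the path expressions above; the label suffix "ltr" is a
# hypothetical example):
#
#   train.tsv        audio manifest read by FileAudioDataset
#   dict.ltr.txt     target dictionary, loaded when args.labels == "ltr"
#   train.ltr        one label line per manifest entry

def example_load_dataset(task):
    task.load_dataset("train")
    # With labels configured, the stored dataset is an AddTargetDataset
    # wrapping the underlying FileAudioDataset.
    return task.datasets["train"]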
def padding_idx(self):
    # Fall back to the default pad index of a fresh Dictionary when no
    # vocabulary has been set; otherwise defer to the configured vocab.
    return Dictionary().pad() if self.vocab is None else self.vocab.pad()
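# Sketch: without a vocab, padding_idx resolves to a fresh Dictionary's
# default pad index (1 in fairseq, since bos/pad/eos/unk are added in that
# order); with a vocab, it simply defers to it. `module` is a hypothetical
# object exposing the method above.

from fairseq.data import Dictionary

def example_padding_idx(module):
    module.vocab = None
    assert module.padding_idx() == Dictionary().pad()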