from typing import Dict, List, Set

# KEY_VALUE_DELIM, VALUE_PARTS_DELIM, VOCAB_DICT_DELIM and to_non_literal_str
# are assumed to be defined elsewhere in this module.


def read_bpe_cache(file: str) -> Dict[str, List[str]]:
    """Read a BPE cache file: one `word -> subword parts` entry per line."""
    words: Dict[str, List[str]] = {}
    with open(file, 'r') as f:
        for line in f:
            line = line.rstrip('\n')
            splits = line.split(KEY_VALUE_DELIM)
            second_column = to_non_literal_str(splits[1]).split(VALUE_PARTS_DELIM)
            words[to_non_literal_str(splits[0])] = second_column
    return words
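
# A minimal usage sketch for read_bpe_cache (the concrete delimiter values are
# an assumption; KEY_VALUE_DELIM and VALUE_PARTS_DELIM come from elsewhere in
# this module). With KEY_VALUE_DELIM = '\t' and VALUE_PARTS_DELIM = ' ', a
# cache line such as
#
#     following<TAB>fol low ing
#
# is read back as a dict entry:
#
#     cache = read_bpe_cache('bpe_cache.txt')   # file name hypothetical
#     assert cache['following'] == ['fol', 'low', 'ing']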

def _load_vocab_dict(file: str) -> Dict[str, int]:
    """Load a `token -> count` vocabulary from a delimiter-separated file."""
    words: Dict[str, int] = {}
    with open(file, 'r') as f:
        for line in f:
            line = line.rstrip('\n')
            splits = line.split(VOCAB_DICT_DELIM)
            words[to_non_literal_str(splits[0])] = int(splits[1])
    return words

def load_vocab(self):
    words = {}
    with open(self.path_to_vocab, 'r') as f:
        for line in f:
            line = line.rstrip('\n')
            splits = line.split(VOCAB_DICT_DELIM)
            words[to_non_literal_str(splits[0])] = int(splits[1])
    return words
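
# Sketch of the vocab-file layout both loaders above expect (the delimiter
# value is an assumption; VOCAB_DICT_DELIM is defined elsewhere). With
# VOCAB_DICT_DELIM = '\t', a file containing
#
#     the<TAB>4053
#     of<TAB>2901
#
# loads as {'the': 4053, 'of': 2901}, whether via _load_vocab_dict(path) or
# via load_vocab() on an object that carries path_to_vocab.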

def parse_file_entry(cls, line: str, priority: int) -> "Merge":
    """Parse one merges-file line into a Merge; the frequency column is optional."""
    try:
        spl = to_non_literal_str(line).split(" ")
        if len(spl) == 2:
            return cls((spl[0], spl[1]), priority=priority)
        else:
            return cls((spl[0], spl[1]), freq=int(spl[2]), priority=priority)
    except (IndexError, TypeError) as err:
        raise ValueError(f"Invalid merge entry format: {line}") from err
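
# Hedged examples of the merge-entry format parse_file_entry accepts: fields
# are space-separated, with an optional third frequency column.
#
#     Merge.parse_file_entry("e r", priority=0)      # pair ('e', 'r'), no freq
#     Merge.parse_file_entry("e r 45", priority=1)   # pair ('e', 'r'), freq=45
#     Merge.parse_file_entry("er", priority=2)       # raises ValueError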

def _load_vocab_set(file: str) -> Set[str]:
    non_bpe_tokens: Set[str] = set()
    with open(file, 'r') as f:
        for line in f:
            non_bpe_tokens.add(to_non_literal_str(line.rstrip('\n')))
    return non_bpe_tokens

def load_nonbpe_vocab(dataset: Dataset) -> Set[str]:
    non_bpe_vocab = set()
    with open(dataset.path_to_nonbpe_vocab_file, 'r') as f:
        for line in f:
            non_bpe_vocab.add(to_non_literal_str(line.rstrip('\n')))
    return non_bpe_vocab
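
# Both set loaders read one token per line. A quick sketch (file name and
# token strings are hypothetical):
#
#     # contents of nonbpe_vocab.txt:
#     #     <EOF>
#     #     <comment>
#
#     tokens = _load_vocab_set('nonbpe_vocab.txt')
#     assert '<EOF>' in tokens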