示例#1
0
文件: cache.py 项目: mir-am/codeprep
def read_bpe_cache(file: str) -> Dict[str, List[str]]:
    """Load a BPE cache file into a mapping from key to list of value parts.

    The file is opened in text mode with the platform default encoding and
    read line by line. Each line is split on ``KEY_VALUE_DELIM``; the first
    field becomes the key and the second field is split on
    ``VALUE_PARTS_DELIM`` to form the value list. Both fields are passed
    through ``to_non_literal_str`` before use. Fields after the second are
    ignored, and a key that appears more than once keeps its last value.

    :param file: path of the cache file to read.
    :return: dict mapping each decoded key to its list of decoded parts.
    :raises IndexError: if a line does not contain ``KEY_VALUE_DELIM``.
    """
    cache = {}
    with open(file, 'r') as cache_file:
        for raw_line in cache_file:
            columns = raw_line.rstrip('\n').split(KEY_VALUE_DELIM)
            # The value column is decoded before the key column, so a line
            # without a delimiter fails on the missing second field first.
            parts = to_non_literal_str(columns[1]).split(VALUE_PARTS_DELIM)
            key = to_non_literal_str(columns[0])
            cache[key] = parts
    return cache
示例#2
0
def _load_vocab_dict(file) -> Dict[str, int]:
    """Load a vocabulary file into a mapping from token to integer value.

    The file is opened in text mode with the platform default encoding.
    Each line is split on ``VOCAB_DICT_DELIM``; the first field is passed
    through ``to_non_literal_str`` and used as the key, the second field is
    converted with ``int`` and used as the value. Fields after the second
    are ignored, and a repeated key keeps its last value.

    :param file: path of the vocabulary file to read.
    :return: dict mapping each decoded token to its integer value.
    :raises IndexError: if a line does not contain ``VOCAB_DICT_DELIM``.
    :raises ValueError: if the second field is not a valid integer.
    """
    vocab = {}
    with open(file, 'r') as vocab_file:
        for raw_line in vocab_file:
            fields = raw_line.rstrip('\n').split(VOCAB_DICT_DELIM)
            # The integer field is parsed before the token is decoded, so a
            # malformed line fails on the number first.
            value = int(fields[1])
            vocab[to_non_literal_str(fields[0])] = value
    return vocab
示例#3
0
 def load_vocab(self):
     """Read the file at ``self.path_to_vocab`` into a token -> int dict.

     The file is opened in text mode with the platform default encoding.
     Every line is split on ``VOCAB_DICT_DELIM``: the first field, after
     being passed through ``to_non_literal_str``, is the key and the second
     field, converted with ``int``, is the value. Extra fields are ignored
     and a repeated key keeps its last value.

     :return: dict mapping each decoded token to its integer value.
     :raises IndexError: if a line does not contain ``VOCAB_DICT_DELIM``.
     :raises ValueError: if the second field is not a valid integer.
     """
     vocab = {}
     with open(self.path_to_vocab, 'r') as vocab_file:
         for raw_line in vocab_file:
             fields = raw_line.rstrip('\n').split(VOCAB_DICT_DELIM)
             # Parse the number first so a malformed line fails on it
             # before the token is decoded.
             value = int(fields[1])
             token = to_non_literal_str(fields[0])
             vocab[token] = value
     return vocab
示例#4
0
 def parse_file_entry(cls, line: str, priority: int) -> "Merge":
     """Build an instance of ``cls`` from one line of a merges file.

     The line is passed through ``to_non_literal_str`` and split on single
     spaces. With exactly two fields the instance is created from the pair
     alone; otherwise the third field is parsed as an integer and passed as
     ``freq``. Fields after the third are ignored.

     :param line: the raw entry text.
     :param priority: forwarded unchanged to ``cls`` as ``priority``.
     :return: the instance produced by calling ``cls``.
     :raises ValueError: if the entry has fewer than two fields, or the
         frequency field is not a valid integer. The underlying error is
         attached as ``__cause__``.
     """
     try:
         spl = to_non_literal_str(line).split(" ")
         pair = (spl[0], spl[1])
         if len(spl) == 2:
             return cls(pair, priority=priority)
         return cls(pair, freq=int(spl[2]), priority=priority)
     except (IndexError, TypeError, ValueError) as err:
         # Chain the cause instead of passing it as a second positional
         # argument, which made str(exc) render as a tuple. ValueError is
         # caught too so a non-numeric frequency reports the same message.
         raise ValueError(f"Invalid merge entry format: {line}") from err
示例#5
0
def _load_vocab_set(file: str) -> Set[str]:
    """Load a one-entry-per-line file into a set of strings.

    The file is opened in text mode with the platform default encoding.
    Trailing newline characters are stripped from each line and the result
    is passed through ``to_non_literal_str`` before being added to the set.

    :param file: path of the file to read.
    :return: set of the decoded lines (duplicates collapse).
    """
    with open(file, 'r') as vocab_file:
        return {
            to_non_literal_str(entry.rstrip('\n'))
            for entry in vocab_file
        }
示例#6
0
def load_nonbpe_vocab(dataset: Dataset) -> Set[str]:
    """Load the file at ``dataset.path_to_nonbpe_vocab_file`` into a set.

    The file is opened in text mode with the platform default encoding.
    Trailing newline characters are stripped from each line and the result
    is passed through ``to_non_literal_str`` before being collected.

    :param dataset: object exposing ``path_to_nonbpe_vocab_file``.
    :return: set of the decoded lines (duplicates collapse).
    """
    with open(dataset.path_to_nonbpe_vocab_file, 'r') as vocab_file:
        stripped_lines = (entry.rstrip('\n') for entry in vocab_file)
        # The generator is consumed here, while the file is still open.
        return set(map(to_non_literal_str, stripped_lines))