from os.path import join, split

from bpe import Encoder

# Assumed module-level constants (not shown in the excerpt): kind[1] supplies
# the "_word"/"_bpe" filename suffix described in the docstring below.
_WORD = ('w', 'word')
_BPE = ('b', 'bpe')


def make_vocab(filename, vocab_size, ngram_max, pct_bpe, sep, ignore_cols, v):
    '''
    Creates a word or byte-pair-encoding vocabulary and its mappings from a
    sample of text. Because this script loads the entire input text into
    memory, it is recommended to use a representative sample for large
    corpora. The vocabulary is saved in a JSON file with the same base name
    as the input file, suffixed with "_word" or "_bpe" depending on the
    encoding used.
    '''
    kind = _BPE if pct_bpe else _WORD
    with open(filename, 'r') as f:
        sample = f.readlines()
    # Pad the separator with spaces for BPE so it survives as its own token;
    # all other spaces are collapsed into underscores so fields stay intact.
    new_sep = f' {sep} ' if kind == _BPE else ' '
    sample = ['<s> ' + x.replace(' ', '_').replace(sep, new_sep) + '</s>'
              for i, x in enumerate(sample) if i not in ignore_cols]
    enc = Encoder(vocab_size, pct_bpe=pct_bpe, silent=not v,
                  ngram_max=ngram_max, required_tokens={'<s>', '</s>'},
                  PAD='<pad>', UNK='<unk>')
    enc.fit(sample)
    enc.vocab_size = len(enc.word_vocab) + len(enc.bpe_vocab)
    enc.mute()
    dir_, name = split(filename)
    enc.save(join(dir_, name.split('.')[0] + f'_{kind[1]}.json'))
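# Usage sketch: the file path and argument values below are hypothetical,
# shown only to illustrate the expected call (pct_bpe=1.0 yields a "_bpe"
# vocabulary file next to the input).
make_vocab('corpus_sample.csv', vocab_size=8192, ngram_max=2,
           pct_bpe=1.0, sep=',', ignore_cols=set(), v=True)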
import itertools

from bpe import Encoder


class BPE(object):
    def __init__(self, vocab_config, file_contents=None, vocab_path=None,
                 out_vocab_path='vocab'):
        # Load a previously saved vocabulary if a path is given; otherwise
        # start from a fresh, pure-BPE encoder.
        if vocab_path:
            self.encoder = self.load_vocab(vocab_path)
        else:
            self.encoder = Encoder(vocab_size=32000, pct_bpe=1.0, silent=False)

    def load_vocab(self, vocab_path):
        return Encoder.load(vocab_path)

    def save_vocab(self, path):
        self.encoder.save(path)

    def tokenize(self, line):
        return self.encoder.tokenize(line)

    def vocab_key(self, w):
        # Fall back to the UNK id when a subword is not in the BPE vocab.
        UNK = self.encoder.word_vocab[self.encoder.UNK]
        return self.encoder.bpe_vocab.get(w, UNK)

    def transform(self, lines):
        # Encoder.transform yields one id list per input line; flatten them
        # into a single sequence of ids.
        return list(itertools.chain.from_iterable(
            self.encoder.transform(lines, reverse=False, fixed_length=None)))

    @property
    def vocab_dim(self):
        return len(self.encoder.bpe_vocab)
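# Usage sketch: 'vocab.json' is a hypothetical saved encoder file and the
# sample line is illustrative; transform expects an iterable of lines.
bpe = BPE(vocab_config=None, vocab_path='vocab.json')
print(bpe.tokenize('def add ( a , b ) : return a + b'))
print(bpe.transform(['def add ( a , b ) : return a + b']))
print(bpe.vocab_dim)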
import sys

import tqdm
from bpe import Encoder


def run_bpe(params):
    bpe_encoder = Encoder(vocab_size=params.vocab_size,
                          pct_bpe=params.pct_bpe,
                          silent=not params.verbose)
    if params.encoder_load_file:
        sys.stdout.write('Using pre-computed BPE encoder\n')
        sys.stdout.flush()
        bpe_encoder = Encoder.load(params.encoder_load_file)
    else:
        sys.stdout.write('Generating new BPE encoder\n')
        sys.stdout.flush()
        text = open(params.source_file).read().split('\n')
        bpe_encoder.fit(text)
        bpe_encoder.save(params.encoder_save_file)
    # Re-read the source file and write one BPE-tokenized line per input line.
    f_src = open(params.source_file)
    f_dst = open(params.destination_file, 'w')
    for line in tqdm.tqdm(f_src.readlines()):
        line = line.strip()
        tokens = bpe_encoder.tokenize(line)
        encoded_line = ' '.join(tokens).strip()
        if encoded_line:  # skip lines that encode to nothing
            f_dst.write(encoded_line + '\n')
    f_src.close()
    f_dst.close()
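# Invocation sketch: run_bpe takes an argparse-style namespace; every field
# value below is a hypothetical placeholder.
from types import SimpleNamespace

run_bpe(SimpleNamespace(
    vocab_size=8192, pct_bpe=1.0, verbose=True, encoder_load_file=None,
    encoder_save_file='encoder.json', source_file='train.txt',
    destination_file='train.bpe.txt'))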
def bpe_encoder_for_lines(cfg: Seq2SeqConfig, lines) -> Encoder:
    """Calculate a BPE encoder for the provided lines of text."""
    encoder = Encoder(vocab_size=cfg.vocab_size,
                      required_tokens=[cfg.start_token, AT_TOKEN,
                                       HASH_TOKEN, SIGNATURE_TOKEN])
    encoder.fit(lines)
    encoder.save('latest_encoder.json')
    return encoder
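# Usage sketch, assuming Seq2SeqConfig exposes vocab_size and start_token
# fields; AT_TOKEN, HASH_TOKEN, and SIGNATURE_TOKEN come from the
# surrounding project, and the sample lines are hypothetical.
cfg = Seq2SeqConfig(vocab_size=10000, start_token='<s>')
encoder = bpe_encoder_for_lines(cfg, ['first sample line',
                                      'second sample line'])
print(encoder.tokenize('first sample line'))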
import argparse
import itertools
import json

from bpe import Encoder


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("data", help="Path to data file")
    ap.add_argument("-v", "--vocabulary", help="Path to output vocab file")
    args = ap.parse_args()
    encoder = Encoder(vocab_size=32000, pct_bpe=1.0)
    with open(args.data) as f:
        data = json.load(f)
    # get_data is defined elsewhere in this project; it extracts the text
    # sequences from the loaded JSON, one iterable per record.
    data = list(get_data(data))
    data = list(itertools.chain.from_iterable(data))
    encoder.fit(data)
    encoder.save(args.vocabulary)
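# Standard entry-point guard (assumed; not shown in the snippet above).
if __name__ == '__main__':
    main()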