def learn_joint_bpe_and_vocab(args):
    """Learn a joint BPE model over all input corpora, then write one
    subword-frequency vocabulary file per corpus.

    ``args`` is an argparse Namespace providing at least: ``input`` (list of
    open files), ``vocab`` (list of open files, same length as ``input``),
    ``output`` (open file for the codes), ``symbols``, ``min_frequency``,
    ``verbose`` and ``separator``.

    Exits with status 1 when the input/vocab file counts differ.
    """
    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write(
            'Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # reopen the argparse-provided handles explicitly as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [
        codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab
    ]

    # get combined word-frequency vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f)
        f.seek(0)  # rewind: each corpus is re-read below when applying BPE

    # "word freq" lines — the dictionary format expected by is_dict=True
    vocab_list = [
        '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()
    ]

    # learn BPE merge operations on the combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list, output, args.symbols,
                            args.min_frequency, args.verbose, is_dict=True)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=args.separator)

    # apply BPE to each training corpus and collect its subword vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):
        # delete=False so the (closed) file can be reopened by name,
        # which also works on Windows; we remove it ourselves below
        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()
        try:
            # segment the corpus into the temp file, then count subwords
            with codecs.open(tmp.name, 'w', encoding='UTF-8') as tmpout:
                train_file.seek(0)
                for line in train_file:
                    tmpout.write(bpe.segment(line).strip())
                    tmpout.write('\n')
            with codecs.open(tmp.name, encoding='UTF-8') as tmpin:
                vocab = learn_bpe.get_vocabulary(tmpin)
        finally:
            # original code leaked the temp file if segmentation raised
            os.remove(tmp.name)

        for key, freq in sorted(vocab.items(), key=lambda x: x[1],
                                reverse=True):
            vocab_file.write("{0} {1}\n".format(key, freq))
        train_file.close()  # original code never closed the input handle
        vocab_file.close()
def learn_bpe(self, item_list, from_filenames=True):
    """Learn BPE codes and save them to ``self.codes_file``.

    item_list: a filename, a list of filenames (``from_filenames=True``),
        or an iterable of text lines (``from_filenames=False``).

    Finally loads the learned codes via ``self.set_bpe()``.
    """
    # lazy %-args: the message is only formatted if the record is emitted
    logging.info('generating bpe codes file. saving to %s', self.codes_file)

    # combined (insertion-ordered) vocabulary of all inputs;
    # hoisted out of the branches — both paths need it
    full_vocab = OrderedCounter()
    if from_filenames:
        # accept a single filename as well as a list of them
        filenames = [item_list] if isinstance(item_list, str) else item_list
        for fname in filenames:
            with codecs.open(fname, encoding='UTF-8') as f:
                full_vocab += learn_bpe.get_vocabulary(f)
    else:
        full_vocab += learn_bpe.get_vocabulary(item_list)

    # "word freq" lines — the dictionary format expected by is_dict=True
    vocab_list = [
        '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()
    ]

    # learn BPE on combined vocabulary
    with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
        learn_bpe.main(vocab_list, output, self.num_symbols,
                       self.min_frequency, False, is_dict=True)
    self.set_bpe(self.codes_file)
def main(input, output_name, vocab, symbols, separator='@@',
         min_frequency=2, verbose=False):
    """Learn a joint BPE model over ``input`` corpora and write one
    subword-frequency vocabulary per corpus.

    input: list of corpus filenames (parameter name shadows the builtin,
        but it is part of the public interface and kept as-is).
    output_name: path the learned BPE codes are written to.
    vocab: list of vocabulary output filenames, parallel to ``input``.
    """
    # read/write files as UTF-8
    input = [codecs.open(f, encoding='UTF-8') for f in input]
    vocab = [codecs.open(f, 'w', encoding='UTF-8') for f in vocab]

    # get combined word-frequency vocabulary of all input texts
    full_vocab = Counter()
    for f in input:
        full_vocab += learn_bpe.get_vocabulary(f)
        f.seek(0)  # rewind: each corpus is re-read below when applying BPE

    vocab_list = ['{0} {1}'.format(key, freq)
                  for (key, freq) in full_vocab.items()]

    # learn BPE on combined vocabulary
    with codecs.open(output_name, 'w', encoding='UTF-8') as output:
        learn_bpe.main(vocab_list, output, symbols, min_frequency, verbose,
                       is_dict=True)

    with codecs.open(output_name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator, None)

    # apply BPE to each training corpus and get its subword vocabulary
    for train_file, vocab_file in zip(input, vocab):
        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()
        try:
            with codecs.open(tmp.name, 'w', encoding='UTF-8') as tmpout:
                train_file.seek(0)
                for line in train_file:
                    tmpout.write(bpe.segment(line).strip())
                    tmpout.write('\n')
            # NOTE: original code rebound the *parameter* `vocab` here,
            # which only worked because zip() had already captured the
            # iterator — renamed to avoid the fragile shadowing
            with codecs.open(tmp.name, encoding='UTF-8') as tmpin:
                subword_vocab = learn_bpe.get_vocabulary(tmpin)
        finally:
            # remove the temp file even if segmentation fails
            os.remove(tmp.name)

        for key, freq in sorted(subword_vocab.items(), key=lambda x: x[1],
                                reverse=True):
            vocab_file.write("{0} {1}\n".format(key, freq))
        train_file.close()
        vocab_file.close()
def main(args):
    """Build a BPE encoder from ``args.codes``, derive a restricted
    vocabulary from corpus word counts, and optionally write a codes
    subset (``args.outcodes``) and/or the vocabulary (``args.bpevocab``).
    """
    # word frequencies of the input corpus (or pre-counted dict file)
    word_counts = learn_bpe.get_vocabulary(args.input, args.dict_input,
                                           args.mincount)
    assert isinstance(word_counts, Counter)

    encoder = apply_bpe.BPE(args.codes, args.separator, vocab=None,
                            unkchar=args.unkchar, unktag=args.unktag)

    # presumably the subset of the vocabulary reachable via the codes —
    # semantics live in learn_bpe.restricted_vocabulary
    restricted = learn_bpe.restricted_vocabulary(encoder, word_counts)

    if args.outcodes is not None:
        encoder.write_subset(args.outcodes, restricted)
    if args.bpevocab is not None:
        learn_bpe.write_vocabulary(restricted, args.bpevocab)
# NOTE(review): fragment of a joint-BPE driver; `args` is an argparse
# Namespace produced elsewhere in the file.
if args.vocab and len(args.input) != len(args.vocab):
    sys.stderr.write(
        'Error: number of input files and vocabulary files must match\n')
    sys.exit(1)

# read/write files as UTF-8 (reopen the argparse-provided handles explicitly)
args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
args.vocab = [
    codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab
]

# get combined word-frequency vocabulary of all input texts
full_vocab = Counter()
for f in args.input:
    full_vocab += learn_bpe.get_vocabulary(f)
    f.seek(0)  # rewind so the corpora can be read again by later steps

# "word freq" lines — the dictionary format expected by is_dict=True below
vocab_list = [
    '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()
]

# learn BPE on combined vocabulary
with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
    learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency,
                   args.verbose, is_dict=True)
# NOTE(review): script body; `parser` is built by create_parser()/argparse
# elsewhere in the file.
args = parser.parse_args()

if args.vocab and len(args.input) != len(args.vocab):
    sys.stderr.write(
        'Error: number of input files and vocabulary files must match\n')
    sys.exit(1)

# read/write files as UTF-8 (reopen the argparse-provided handles explicitly)
args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
args.vocab = [
    codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab
]

# get combined vocabulary of all input texts, keeping the per-corpus
# vocabularies as well (used below by make_vocabularies)
full_vocab = Counter()
vocabs = []
for f in args.input:
    v = learn_bpe.get_vocabulary(f, args.dict_input, args.mincount)
    vocabs.append(v)
    full_vocab += v
    f.seek(0)  # rewind so each corpus can be re-read when applying BPE

# learn BPE on combined vocabulary
with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
    learn_bpe.main_args(args, full_vocab, output, is_dict=True)

with codecs.open(args.output.name, encoding='UTF-8') as codes:
    bpe = apply_bpe.BPE(codes, args.separator, None)

# apply BPE to each training corpus and write the per-corpus vocabularies
learn_bpe.make_vocabularies(bpe, vocabs, args.vocab)
def learn_joint_bpe_and_vocab(args):
    """Learn a joint BPE model over all input corpora and write one subword
    vocabulary per corpus, with optional special-vocabulary and character
    coverage.

    ``args`` is an argparse Namespace providing at least: ``input``,
    ``vocab`` (parallel lists of open files), ``output``, ``symbols``,
    ``min_frequency``, ``verbose``, ``separator``, ``dict_input``,
    ``special_vocab``, ``total_symbols``, ``postpend`` and
    ``character_vocab``.

    Exits with status 1 on a count mismatch or on malformed input lines.
    """
    if args.vocab and len(args.input) != len(args.vocab):
        sys.stderr.write(
            'Error: number of input files and vocabulary files must match\n')
        sys.exit(1)

    # read/write files as UTF-8
    args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input]
    args.vocab = [
        codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab
    ]

    # replace the special-vocab path by the list of words it contains
    if args.special_vocab:
        with codecs.open(args.special_vocab, encoding='UTF-8') as f:
            # NOTE(review): the comprehension opens the file a second time
            # and `f` is unused — the extra handle is only closed by GC
            l = [
                line.strip('\r\n ')
                for line in codecs.open(args.special_vocab, encoding='UTF-8')
            ]
        args.special_vocab = l

    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in args.input:
        full_vocab += learn_bpe.get_vocabulary(f, args.dict_input)
        f.seek(0)  # rewind: the corpora are re-read below when applying BPE

    if args.special_vocab:
        for word in args.special_vocab:
            full_vocab[word] += 1  # integrate special vocab to full_vocab

    vocab_list = yield_dict_lines(full_vocab)

    # learn BPE on combined vocabulary
    with codecs.open(args.output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(vocab_list, output, args.symbols,
                            args.min_frequency, args.verbose, is_dict=True,
                            total_symbols=args.total_symbols,
                            is_postpend=args.postpend,
                            special_vocab=args.special_vocab)

    with codecs.open(args.output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=args.separator,
                            is_postpend=args.postpend)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(args.input, args.vocab):
        if args.dict_input:
            # input is already a "word count" dictionary: segment each entry
            vocab = Counter()
            for i, line in enumerate(train_file):
                # NOTE(review): bare except also hides KeyboardInterrupt etc.
                try:
                    word, count = line.strip('\r\n ').split(' ')
                    segments = bpe.segment_tokens([word])
                except:
                    print('Failed reading vocabulary file at line {0}: {1}'.
                          format(i, line))
                    sys.exit(1)
                for seg in segments:
                    vocab[seg] += int(count)
        else:
            # plain corpus: segment into a temp file, then count subwords
            tmp = tempfile.NamedTemporaryFile(delete=False)
            tmp.close()
            tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')
            train_file.seek(0)
            for line in train_file:
                tmpout.write(bpe.process_line(line).strip())
                tmpout.write('\n')
            tmpout.close()
            tmpin = codecs.open(tmp.name, encoding='UTF-8')
            vocab = learn_bpe.get_vocabulary(tmpin)
            tmpin.close()
            os.remove(tmp.name)

        # if special vocab is defined, include them
        if args.special_vocab:
            for i, word in enumerate(args.special_vocab):
                try:
                    segments = bpe.segment_tokens([word])
                except:
                    print(
                        'Failed reading special vocabulary file at line {0}: {1}'
                        .format(i, line))
                    sys.exit(1)
                # warn when a special word is not a single symbol after BPE
                if len(segments) != 1:
                    sys.stderr.write(
                        'WARNING: special vocab \'{0}\' not captured by merges, split into \'{1}\'\n'
                        .format(word, ' '.join(segments)))
                for seg in segments:
                    vocab[seg] += 1

        sys.stderr.write('Vocabulary got {0:d} unique items\n'.format(
            len(vocab)))

        # if character vocab is to be included
        if args.character_vocab:
            char_internal, char_terminal = learn_bpe.extract_uniq_chars(
                full_vocab, args.postpend)
            sys.stderr.write(
                'Got {0:d} non-terminal and {1:d} terminal characters\n'.
                format(len(char_internal), len(char_terminal)))
            # pseudo-counts above every real count so characters sort first
            pseudo_count_terminal = max(
                vocab.values()) + 2  # always precedes non-terminal
            pseudo_count_internal = max(
                vocab.values()) + 1  # always precedes other items
            for c in char_terminal:
                vocab[c] = pseudo_count_terminal
            for c in char_internal:
                # attach the separator on the side matching the BPE mode
                c = '{0}{1}'.format(args.separator,
                                    c) if args.postpend else '{0}{1}'.format(
                                        c, args.separator)
                vocab[c] = pseudo_count_internal

        # sort by descending frequency, then alphabetically for ties
        for key, freq in sorted(vocab.items(), key=lambda x: (-x[1], x[0])):
            vocab_file.write("{0} {1}\n".format(key, freq))
        train_file.close()
        vocab_file.close()
parser = create_parser() args = parser.parse_args() if args.vocab and len(args.input) != len(args.vocab): sys.stderr.write('Error: number of input files and vocabulary files must match\n') sys.exit(1) # read/write files as UTF-8 args.input = [codecs.open(f.name, encoding='UTF-8') for f in args.input] args.vocab = [codecs.open(f.name, 'w', encoding='UTF-8') for f in args.vocab] # get combined vocabulary of all input texts full_vocab = Counter() for f in args.input: full_vocab += learn_bpe.get_vocabulary(f) f.seek(0) vocab_list = ['{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()] # learn BPE on combined vocabulary with codecs.open(args.output.name, 'w', encoding='UTF-8') as output: learn_bpe.main(vocab_list, output, args.symbols, args.min_frequency, args.verbose, is_dict=True) with codecs.open(args.output.name, encoding='UTF-8') as codes: bpe = apply_bpe.BPE(codes, separator=args.separator) # apply BPE to each training corpus and get vocabulary for train_file, vocab_file in zip(args.input, args.vocab): tmp = tempfile.NamedTemporaryFile(delete=False)