def setUp(self):
    amock = mock.MagicMock()
    amock.readline.return_value = 'something'
    glossaries = ['like', 'USA']
    rglossaries = ['M[Manuel]*l']
    self.bpe = BPE(amock, glossaries=glossaries, rglossaries=rglossaries)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="subword-nmt: unsupervised word segmentation for neural machine translation and text generation")
    subparsers = parser.add_subparsers(dest='command',
        help="""command to run. Run one of the commands with '-h' for more info.

learn-bpe: learn BPE merge operations on input text.
apply-bpe: apply given BPE operations to input text.
get-vocab: extract vocabulary and word frequencies from input text.
learn-joint-bpe-and-vocab: executes recommended workflow for joint BPE.""")

    learn_bpe_parser = create_learn_bpe_parser(subparsers)
    apply_bpe_parser = create_apply_bpe_parser(subparsers)
    get_vocab_parser = create_get_vocab_parser(subparsers)
    learn_joint_bpe_and_vocab_parser = create_learn_joint_bpe_and_vocab_parser(subparsers)

    args = parser.parse_args()

    if args.command == 'learn-bpe':
        # read/write files as UTF-8
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        learn_bpe(args.input, args.output, args.symbols, args.min_frequency,
                  args.verbose, is_dict=args.dict_input)
    elif args.command == 'apply-bpe':
        # read/write files as UTF-8
        args.codes = codecs.open(args.codes.name, encoding='utf-8')
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        if args.vocabulary:
            args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8')
        if args.vocabulary:
            vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold)
        else:
            vocabulary = None
        bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)
        for line in args.input:
            args.output.write(bpe.process_line(line))
    elif args.command == 'get-vocab':
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        get_vocab(args.input, args.output)
    elif args.command == 'learn-joint-bpe-and-vocab':
        learn_joint_bpe_and_vocab(args)
    else:
        raise Exception('Invalid command provided')
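For context, the learn/apply workflow this CLI wraps can also be driven from Python directly. A minimal sketch (the file names and the 10000-merge count are hypothetical; the flat `learn_bpe`/`apply_bpe` module names assume the scripts are on the import path, as other snippets in this listing do):

import codecs
from learn_bpe import learn_bpe
from apply_bpe import BPE

# learn merge operations from a tokenized training corpus (hypothetical files)
with codecs.open('corpus.tok.en', encoding='utf-8') as infile, \
        codecs.open('bpe.codes', 'w', encoding='utf-8') as outfile:
    learn_bpe(infile, outfile, 10000)

# apply the learned operations line by line
with codecs.open('bpe.codes', encoding='utf-8') as codes:
    bpe = BPE(codes)
with codecs.open('corpus.tok.en', encoding='utf-8') as infile:
    for line in infile:
        print(bpe.process_line(line), end='')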
def __init__(self, opt):
    self.opt = opt
    self.sep = opt.seprator + " "
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    self.bpe = BPE(codecs.open(self.opt.bpe_codes, 'r', encoding="UTF-8"),
                   self.opt.seprator, None, None)
    self.translator = onmt.Translator(opt)
    self.nlp = BosonNLP("NGhNiav2.16134.DvyEDmGzYd2S")
def __init__(self, opt):
    self.opt = opt
    self.sep = opt.seprator + " "
    if opt.cuda:
        torch.cuda.set_device(opt.gpu)
    self.bpe = BPE(codecs.open(opt.bpe_codes, 'r', encoding="UTF-8"),
                   opt.seprator, None, None)
    self.tokenizer = MosesTokenizer()
    self.detokenizer = MosesDetokenizer()
    self.translator = onmt.Translator(opt)
class TestBPESegmentMethod(unittest.TestCase):

    def setUp(self):
        with codecs.open(os.path.join(currentdir, 'data', 'bpe.ref'), encoding='utf-8') as bpefile:
            self.bpe = BPE(bpefile)
        self.infile = codecs.open(os.path.join(currentdir, 'data', 'corpus.en'), encoding='utf-8')
        self.reffile = codecs.open(os.path.join(currentdir, 'data', 'corpus.bpe.ref.en'), encoding='utf-8')

    def tearDown(self):
        self.infile.close()
        self.reffile.close()

    def test_apply_bpe(self):
        for line, ref in zip(self.infile, self.reffile):
            out = self.bpe.process_line(line)
            self.assertEqual(out, ref)

    def test_trailing_whitespace(self):
        """BPE.process_line() preserves leading and trailing whitespace."""
        orig = ' iron cement \n'
        exp = ' ir@@ on c@@ ement \n'
        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_utf8_whitespace(self):
        """UTF-8 whitespace is treated as a normal character, not a word boundary."""
        orig = 'iron\xa0cement\n'
        exp = 'ir@@ on@@ \xa0@@ c@@ ement\n'
        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_empty_line(self):
        orig = '\n'
        exp = '\n'
        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)
class SplitWord():

    def __init__(self, config):
        if "BPE" in config:
            if "BPE" in config["BPE"]:
                self.way = config["BPE"]
                if config["BPE"] == "BPE":
                    self.bpe = BPE(codecs.open('D:/wiki_20180801/bpe.code', encoding='utf-8'), separator='')
                elif config["BPE"] == "BPE1000":
                    self.bpe = BPE(codecs.open('D:/wiki_20180801/bpe1000.code', encoding='utf-8'), separator='')
                else:
                    print("BPE define error")
                    exit()
            else:
                self.way = config["BPE"]
        else:
            self.way = "Normal"

    def __call__(self, word):
        if self.way == "BPE":
            return self.bpe.process_line(word).split(" ")
        elif self.way == "Ngram":
            list_of_ngram = []
            for i in range(3, 7):
                list_of_ngram.extend(ngram(word, i))
            return list_of_ngram
        else:
            return word
class BPEService(object):

    def __init__(self, codes):
        self.bpe = BPE(codecs.open(codes, encoding='utf-8'))

    def process_line(self, line):
        return self.bpe.process_line(line.decode("UTF-8")).encode("UTF-8")
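A possible invocation of this wrapper, assuming a codes file path; because of the decode/encode round-trip it accepts and returns UTF-8 bytes (the exact segmentation depends on the learned codes):

service = BPEService('bpe.codes')  # hypothetical codes file
segmented = service.process_line('lower newest\n'.encode('utf-8'))
# returns UTF-8 bytes, e.g. something like b'lo@@ wer ne@@ west\n'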
def __init__(self, srclang, targetlang, sourcebpe=None, targetbpe=None, sourcespm=None, targetspm=None):
    self.bpe_source = None
    self.bpe_target = None
    self.sp_processor_source = None
    self.sp_processor_target = None
    self.sentences = []

    # load BPE model for pre-processing
    if sourcebpe:
        # print("load BPE codes from " + sourcebpe, flush=True)
        BPEcodes = open(sourcebpe, 'r', encoding="utf-8")
        self.bpe_source = BPE(BPEcodes)
    if targetbpe:
        # print("load BPE codes from " + targetbpe, flush=True)
        BPEcodes = open(targetbpe, 'r', encoding="utf-8")
        self.bpe_target = BPE(BPEcodes)

    # load SentencePiece model for pre-processing
    if sourcespm:
        # print("load sentence piece model from " + sourcespm, flush=True)
        self.sp_processor_source = sentencepiece.SentencePieceProcessor()
        self.sp_processor_source.Load(sourcespm)
    if targetspm:
        # print("load sentence piece model from " + targetspm, flush=True)
        self.sp_processor_target = sentencepiece.SentencePieceProcessor()
        self.sp_processor_target.Load(targetspm)

    # pre- and post-processing tools
    self.tokenizer = None
    self.detokenizer = None
    # TODO: should we have support for other sentence splitters?
    # print("start pre- and post-processing tools")
    self.sentence_splitter = MosesSentenceSplitter(srclang)
    self.normalizer = MosesPunctuationNormalizer(srclang)
    if self.bpe_source:
        self.tokenizer = MosesTokenizer(srclang)
    if self.bpe_source:
        self.detokenizer = MosesDetokenizer(targetlang)
def main(unused_argv):
    logging.set_verbosity(logging.INFO)
    if not gfile.IsDirectory(OutputPath('')):
        gfile.MakeDirs(OutputPath(''))
    bpe = BPE(codecs.open("code-file", encoding='utf-8'), "@@")
    wordMapPath = "word-map"
    tagMapPath = "tag-map"
    pMapPath = "prefix-list"
    sMapPath = "suffix-list"
    pMap = readAffix(pMapPath)
    sMap = readAffix(sMapPath)
    wordMap = readMap(wordMapPath)
    tagMap = readMap(tagMapPath)
    wordMap, _ = bpe.segment(wordMap)
    wordMap = list(set(process_seg_sent(wordMap)))
    wordMap.insert(0, "-start-")
    wordMap.insert(0, "-end-")
    wordMap.insert(0, "-unknown-")
    pMap.insert(0, "-start-")
    pMap.insert(0, "-unknown-")
    sMap.insert(0, "-start-")
    sMap.insert(0, "-unknown-")
    # num of features for each feature group:
    # capitalization, words, other, prefix_2, suffix_2, previous_tags
    feature_sizes = [8, 8, 2, 4]
    domain_sizes = [3, len(wordMap) + 3, 3, len(tagMap) + 1]
    num_actions = 45
    embedding_dims = [8, 64, 8, 16]
    train_data_path = '/cs/natlang-user/vivian/wsj-conll/train.conllu'
    dev_data_path = '/cs/natlang-user/vivian/wsj-conll/dev.conllu'
    logging.info("loading data and precomputing features...")
    train_data = ConllData(train_data_path, wordMap, tagMap, pMap, sMap, bpe)
    dev_data = ConllData(dev_data_path, wordMap, tagMap, pMap, sMap, bpe)
    with tf.Session(FLAGS.tf_master) as sess:
        Train(sess, num_actions, feature_sizes, domain_sizes, embedding_dims,
              wordMap, tagMap, pMap, sMap, train_data, dev_data, bpe)
class TestBPESegmentMethod(unittest.TestCase):

    def setUp(self):
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        orig, expected = test_case
        out = self.bpe.segment(orig)
        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeword likeManuelword'
        exp = 'w@@ o@@ r@@ d@@ like@@ w@@ o@@ r@@ d l@@ i@@ k@@ e@@ M@@ a@@ n@@ u@@ e@@ l@@ word'
        test_case = (orig, exp)
        self._run_test_case(test_case)
class TestBPEIsolateGlossariesMethod(unittest.TestCase):

    def setUp(self):
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        orig, expected = test_case
        out = self.bpe._isolate_glossaries(orig)
        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeUSAwordManuelManuelwordUSA'
        exp = ['word', 'like', 'USA', 'word', 'Manuel', 'Manuel', 'word', 'USA']
        test_case = (orig, exp)
        self._run_test_case(test_case)
class TestBPESegmentMethod(unittest.TestCase):

    def setUp(self):
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ['like', 'Manuel', 'USA']
        self.bpe = BPE(amock, glossaries=glossaries)

    @mock.patch('apply_bpe.encode', side_effect=encode_mock)
    def _run_test_case(self, test_case, encode_function):
        orig, expected = test_case
        out = self.bpe.segment(orig)
        self.assertEqual(out, expected)

    def test_multiple_glossaries(self):
        orig = 'wordlikeword likeManuelword'
        exp = 'wo@@ rd@@ like@@ wo@@ rd like@@ Manuel@@ wo@@ rd'
        test_case = (orig, exp)
        self._run_test_case(test_case)
class TestRegexIsolateGlossaries(unittest.TestCase):

    def setUp(self):
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        glossaries = ["<country>\w*</country>", "<name>\w*</name>", "\d+"]
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        orig, expected = test_case
        out = self.bpe._isolate_glossaries(orig)
        self.assertEqual(out, expected)

    def test_regex_glossaries(self):
        orig = 'wordlike<country>USA</country>word10001word<name>Manuel</name>word<country>USA</country>'
        exp = ['wordlike', '<country>USA</country>', 'word', '10001',
               'word', '<name>Manuel</name>', 'word', '<country>USA</country>']
        test_case = (orig, exp)
        self._run_test_case(test_case)
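The glossary mechanism these tests exercise can also be used directly: spans matching a glossary pattern are split out and protected from further segmentation. A minimal sketch (the codes file is hypothetical, and the printed segmentation depends on the learned merges):

with codecs.open('bpe.codes', encoding='utf-8') as codes:  # hypothetical codes file
    bpe = BPE(codes, glossaries=["<name>\w*</name>", "\d+"])
# '<name>Manuel</name>' and '10001' come through unsegmented
print(bpe.segment('call <name>Manuel</name> at 10001'))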
class E2C(object):

    def __init__(self, opt):
        self.opt = opt
        self.sep = opt.seprator + " "
        if opt.cuda:
            torch.cuda.set_device(opt.gpu)
        self.bpe = BPE(codecs.open(opt.bpe_codes, 'r', encoding="UTF-8"),
                       opt.seprator, None, None)
        self.tokenizer = MosesTokenizer()
        self.detokenizer = MosesDetokenizer()
        self.translator = onmt.Translator(opt)

    def tokenDoc(self, doc):
        sentenceList = sent_tokenize(doc.strip())
        print 'e2c sentenceList : ', sentenceList
        tokens = []
        for sent in sentenceList:
            sent = sent.lower()
            sent = self.detokenizer.unescape_xml(
                self.tokenizer.tokenize(sent, return_str=True))
            if self.opt.bpe_codes != "":
                sent = self.bpe.segment(sent).strip()
            token = sent.split()
            tokens += [token]
        return tokens

    def translate(self, doc):
        batch = self.tokenDoc(doc)
        pred, _, _, _, _ = self.translator.translate(batch, None)
        rstr = ""
        # ipdb.set_trace()
        for idx in range(len(pred)):
            rstr += ''.join(' '.join(pred[idx][0]).replace(self.sep, '').split()) + "\n\n"
        print 'e2c rstr : ', rstr.strip()
        return rstr.strip()
from onmt import model_builder

model = model_builder.build_base_model(model_opt, fields, cur_device == 'gpu', checkpoint, gpu_id)
model.to(cur_device)
model.eval()

from apply_bpe import BPE
import codecs

codes = codecs.open(
    "/scratch/project_2001970/AleModel/bpe-model.de-en-35k.wmt19-news-para.norm.tok.tc",
    encoding='utf-8')
bpe = BPE(codes)


def prepare(params, samples):
    # _, params.word2id = create_dictionary(samples)
    # params.word_vec = get_wordvec(PATH_TO_VEC, params.word2id)
    # params.wvec_dim = 300
    import ipdb
    ipdb.set_trace()
    if params['save_embedds']:
        # get rid of empty lines
        samples = [sent if sent != [] else ['.'] for sent in samples]
        # apply BPE to batch
        sents = []
        for sent in samples:
            str1 = ' '.join(sent)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-raw_dir', required=True)
    parser.add_argument('-data_dir', required=True)
    parser.add_argument('-codes', required=True)
    parser.add_argument('-save_data', required=True)
    parser.add_argument('-prefix', required=True)
    parser.add_argument('-max_len', type=int, default=100)
    parser.add_argument('--symbols', '-s', type=int, default=32000, help="Vocabulary size")
    parser.add_argument(
        '--min-frequency', type=int, default=6, metavar='FREQ',
        help='Stop if no symbol pair has frequency >= FREQ (default: %(default)s)')
    parser.add_argument(
        '--dict-input', action="store_true",
        help="If set, input file is interpreted as a dictionary where each line contains a word-count pair")
    parser.add_argument(
        '--separator', type=str, default='@@', metavar='STR',
        help="Separator between non-final subword units (default: '%(default)s')")
    parser.add_argument('--total-symbols', '-t', action="store_true")
    opt = parser.parse_args()

    # Create folders if needed.
    mkdir_if_needed(opt.raw_dir)
    mkdir_if_needed(opt.data_dir)

    # Download and extract raw data.
    raw_train = get_raw_files(opt.raw_dir, _TRAIN_DATA_SOURCES)
    raw_val = get_raw_files(opt.raw_dir, _VAL_DATA_SOURCES)
    raw_test = get_raw_files(opt.raw_dir, _TEST_DATA_SOURCES)

    # Merge files into one.
    train_src, train_trg = compile_files(opt.raw_dir, raw_train, opt.prefix + '-train')
    val_src, val_trg = compile_files(opt.raw_dir, raw_val, opt.prefix + '-val')
    test_src, test_trg = compile_files(opt.raw_dir, raw_test, opt.prefix + '-test')

    # Learn the codes from the training files if they do not exist yet.
    opt.codes = os.path.join(opt.data_dir, opt.codes)
    if not os.path.isfile(opt.codes):
        sys.stderr.write(f"Collect codes from training data and save to {opt.codes}.\n")
        learn_bpe(raw_train['src'] + raw_train['trg'], opt.codes, opt.symbols, opt.min_frequency, True)
    sys.stderr.write("BPE codes prepared.\n")

    sys.stderr.write("Build up the tokenizer.\n")
    with codecs.open(opt.codes, encoding='utf-8') as codes:
        bpe = BPE(codes, separator=opt.separator)

    sys.stderr.write("Encoding ...\n")
    encode_files(bpe, train_src, train_trg, opt.data_dir, opt.prefix + '-train')
    encode_files(bpe, val_src, val_trg, opt.data_dir, opt.prefix + '-val')
    encode_files(bpe, test_src, test_trg, opt.data_dir, opt.prefix + '-test')
    sys.stderr.write("Done.\n")

    field = torchtext.data.Field(
        tokenize=str.split,
        lower=True,
        pad_token=Constants.PAD_WORD,
        init_token=Constants.BOS_WORD,
        eos_token=Constants.EOS_WORD)
    fields = (field, field)

    MAX_LEN = opt.max_len

    def filter_examples_with_length(x):
        return len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN

    enc_train_files_prefix = opt.prefix + '-train'
    train = TranslationDataset(
        fields=fields,
        path=os.path.join(opt.data_dir, enc_train_files_prefix),
        exts=('.src', '.trg'),
        filter_pred=filter_examples_with_length)

    from itertools import chain
    field.build_vocab(chain(train.src, train.trg), min_freq=2)

    data = {'settings': opt, 'vocab': field}
    opt.save_data = os.path.join(opt.data_dir, opt.save_data)
    print('[Info] Dumping the processed data to pickle file', opt.save_data)
    pickle.dump(data, open(opt.save_data, 'wb'))
        response_list.append(train_responses_txt[ind])
        print(train_contexts_txt[i])
        print(response_list[i])

    with open(output_file, 'w') as f1:
        for response in response_list:
            f1.write(response)


if __name__ == '__main__':
    twitter_bpe_dictionary = '../TwitterData/BPE/Twitter_Codes_5000.txt'
    twitter_bpe_separator = '@@'
    twitter_model_dictionary = '../TwitterData/BPE/Dataset.dict.pkl'

    # Load in Twitter dictionaries
    twitter_bpe = BPE(open(twitter_bpe_dictionary, 'r').readlines(), twitter_bpe_separator)
    twitter_dict = pickle.load(open(twitter_model_dictionary, 'r'))
    twitter_str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in twitter_dict])
    twitter_idx_to_str = dict([(tok_id, tok) for tok, tok_id, _, _ in twitter_dict])

    # Get data, for Twitter
    train_file = '/home/ml/rlowe1/TwitterData/TwitterDataBPE/Train.dialogues.pkl'
    test_file = '/home/ml/rlowe1/TwitterData/TwitterDataBPE/Test.dialogues.pkl'
    output_file = './output.csv'

    with open(train_file) as f1:
        train_data = pickle.load(f1)
    with open(test_file) as f1:
        test_data = pickle.load(f1)
###############################################################################
#
# Main program
#
###############################################################################

parser = create_parser()
args = parser.parse_args()

# all models are grouped into one structure
model = namedtuple("model", ["htable", "bpe", "net"])

print("\nLoading models")
loaded = torch.load(args.model.name)
model.htable = LoadHashTable(args.hash_table, args.verbose)
model.bpe = BPE(args.bpe_codes, separator=args.separator)
model.net = encoders.BLSTM(args.model.name, gpu=args.gpu, verbose=args.verbose)

# all data structures are grouped into one structure
data = namedtuple("data", [
    "text_bpe", "text_slen", "text_bin", "text_enc",
    "file_bpe", "file_bin", "file_enc",
    "idx_pad", "idx_unk"
])
data.text_slen = np.empty(args.bsize, dtype=np.int32)
data.text_bin = np.empty((args.max_len, args.bsize), dtype=np.int32)
data.text_enc = np.empty((args.bsize, model.net.nembed), dtype=np.float32) \
    if model.net else 0
data.idx_unk = model.htable['<UNK>']
data.idx_pad = model.htable['<PAD>']
class VHRED(object):

    def __init__(self, config):
        self.config = config
        self.f_dict = config['vhred_dict']

        # Load the VHRED model.
        self.model, self.enc_fn, self.dec_fn = self._build_vhred_model()

        # Load in Twitter dictionaries for BPE conversion.
        f_bpe_dictionary = config['vhred_bpe_file']
        with open(f_bpe_dictionary, 'r') as handle:
            self.bpe = BPE(handle.readlines(), '@@')
        with open(self.f_dict, 'r') as handle:
            twitter_dict = cPickle.load(handle)

        self.str_to_idx = dict([(tok, tok_id) for tok, tok_id, _, _ in twitter_dict])
        self.idx_to_str = dict([(tok_id, tok) for tok, tok_id, _, _ in twitter_dict])

        self.MODELS = ['hred', 'human', 'tfidf', 'de']

    def _convert_text_to_bpe(self, contexts, gt_responses, model_responses, ignore_models=False):
        # Files needed for BPE conversions.
        context_ids = self._strs_to_idxs(contexts)
        gt_response_ids = self._strs_to_idxs(gt_responses)
        longest = 0
        for res in gt_response_ids:
            if len(res) > longest:
                longest = len(res)
        print 'Longest Response:', longest
        if not ignore_models:
            model_response_ids = self._strs_to_idxs(model_responses)
        else:
            model_response_ids = None
        return context_ids, gt_response_ids, model_response_ids

    def _strs_to_idxs(self, data):
        out = []
        for row in data:
            bpe_segmented = self.bpe.segment(row.strip())
            out.append([self.str_to_idx[word] for word in bpe_segmented.split()
                        if word in self.str_to_idx])
        return out

    def _idxs_to_strs(self, data):
        out = []
        for row in data:
            s = ' '.join([self.idx_to_str[word] for word in row])
            out.append(s.replace('@@ ', ''))
        return out

    def _build_vhred_model(self):
        # Update the state dictionary.
        state = VHRED_prototype_state()
        model_prefix = self.config['vhred_prefix']
        state_path = model_prefix + "_state.pkl"
        model_path = model_prefix + "_model.npz"
        with open(state_path, 'rb') as handle:
            state.update(cPickle.load(handle))

        # Update the bs for the current data.
        state['bs'] = 100
        state['dictionary'] = self.f_dict

        # Create the model:
        model = VHRED_DialogEncoderDecoder(state)
        model.bs = 100

        enc_fn = model.build_encoder_function()
        dec_fn = model.build_decoder_encoding()

        return model, enc_fn, dec_fn

    def _extract_text(self, dataset, ignore_models=False):
        cs, gt_rs, m_rs = [], [], []
        for entry in dataset:
            cs.append(entry['c'])
            gt_rs.append(entry['r_gt'])
            # Extract in this order so we don't mix up which responses came from which models.
            if not ignore_models:
                for m_name in self.MODELS:
                    m_rs.append(entry['r_models'][m_name][0])

        # Add </s> token to beginning of each.
        cs = ['</s> ' + c.strip() if '</s> ' not in c[0:6] else c.strip() for c in cs]
        gt_rs = ['</s> ' + c.strip() if '</s> ' not in c[0:6] else c.strip() for c in gt_rs]
        if not ignore_models:
            m_rs = ['</s> ' + c.strip() if '</s> ' not in c[0:6] else c.strip() for c in m_rs]

        return cs, gt_rs, m_rs

    # Compute model embeddings for contexts or responses.
    # Embedding type can be 'CONTEXT' or 'DECODER'.
    def _compute_embeddings(self, data):
        embeddings = []
        context_ids_batch = []
        batch_index = 0
        batch_total = int(math.ceil(float(len(data)) / float(self.model.bs)))
        counter = 0
        max_len = 0
        for context_ids in data:
            counter += 1
            context_ids_batch.append(context_ids)
            # If we have filled up a batch, or reached the end of our data:
            if len(context_ids_batch) == self.model.bs or counter == len(data):
                batch_index += 1
                length = len(context_ids_batch)
                if len(context_ids_batch) < self.model.bs:
                    # Pad the data to get a full batch.
                    while len(context_ids_batch) < self.model.bs:
                        context_ids_batch.append(context_ids_batch[0])

                print 'Computing embeddings for batch %d/%d' % (batch_index, batch_total)
                encs = VHRED_compute_encodings(context_ids_batch, self.model, self.enc_fn,
                                               self.dec_fn, self.config['embedding_type'])
                if length < self.model.bs:
                    encs = encs[:length]
                for i in range(len(encs)):
                    embeddings.append(encs[i, :].tolist())
                context_ids_batch = []

        return embeddings

    def _add_embeddings_to_dataset(self, dataset, c_embs, r_gt_embs, r_model_embs, ignore_models=False):
        for ix in xrange(len(dataset)):
            dataset[ix]['c_emb'] = c_embs[ix]
            dataset[ix]['r_gt_emb'] = r_gt_embs[ix]
            if not ignore_models:
                dataset[ix]['r_model_embs'] = {}
                for jx, m_name in enumerate(self.MODELS):
                    dataset[ix]['r_model_embs'][m_name] = r_model_embs[ix * len(self.MODELS) + jx]
        return dataset

    def get_embeddings(self, dataset, new_models=None, ignore_models=False):
        '''
        Dataset should be a list of dictionaries. Each dictionary should have
        keys: c, r_gt, r_models = {'model_name': [r, score, length], ...}
        '''
        if new_models is not None:
            self.MODELS = new_models
        if 'r_models' not in dataset[0]:
            ignore_models = True

        contexts, gt_responses, model_responses = self._extract_text(dataset, ignore_models=ignore_models)
        context_ids, gt_response_ids, model_response_ids = self._convert_text_to_bpe(
            contexts, gt_responses, model_responses, ignore_models=ignore_models)

        print 'Computing context embeddings...'
        context_embs = self._compute_embeddings(context_ids)
        print 'Computing ground truth response embeddings...'
        gt_response_embs = self._compute_embeddings(gt_response_ids)
        if not ignore_models:
            print 'Computing model response embeddings...'
            model_response_embs = self._compute_embeddings(model_response_ids)
        else:
            model_response_embs = None

        # Update our dataset with each of the embeddings.
        dataset = self._add_embeddings_to_dataset(dataset, context_embs, gt_response_embs,
                                                  model_response_embs, ignore_models=ignore_models)
        return dataset

    def use_saved_embeddings(self):
        with open(self.config['vhred_embeddings_file'], 'rb') as handle:
            dataset = cPickle.load(handle)
        return dataset
class ContentProcessor():

    def __init__(self, srclang, targetlang, sourcebpe=None, targetbpe=None, sourcespm=None, targetspm=None):
        self.bpe_source = None
        self.bpe_target = None
        self.sp_processor_source = None
        self.sp_processor_target = None
        self.sentences = []

        # load BPE model for pre-processing
        if sourcebpe:
            # print("load BPE codes from " + sourcebpe, flush=True)
            BPEcodes = open(sourcebpe, 'r', encoding="utf-8")
            self.bpe_source = BPE(BPEcodes)
        if targetbpe:
            # print("load BPE codes from " + targetbpe, flush=True)
            BPEcodes = open(targetbpe, 'r', encoding="utf-8")
            self.bpe_target = BPE(BPEcodes)

        # load SentencePiece model for pre-processing
        if sourcespm:
            # print("load sentence piece model from " + sourcespm, flush=True)
            self.sp_processor_source = sentencepiece.SentencePieceProcessor()
            self.sp_processor_source.Load(sourcespm)
        if targetspm:
            # print("load sentence piece model from " + targetspm, flush=True)
            self.sp_processor_target = sentencepiece.SentencePieceProcessor()
            self.sp_processor_target.Load(targetspm)

        # pre- and post-processing tools
        self.tokenizer = None
        self.detokenizer = None
        # TODO: should we have support for other sentence splitters?
        # print("start pre- and post-processing tools")
        self.sentence_splitter = MosesSentenceSplitter(srclang)
        self.normalizer = MosesPunctuationNormalizer(srclang)
        if self.bpe_source:
            self.tokenizer = MosesTokenizer(srclang)
            self.detokenizer = MosesDetokenizer(targetlang)

    def preprocess(self, srctxt):
        # the normalizer does not accept '\n', so normalize line by line
        normalized_text = '\n'.join(self.normalizer(line) for line in srctxt.split('\n'))
        sentSource = self.sentence_splitter([normalized_text])
        self.sentences = []
        for s in sentSource:
            if self.tokenizer:
                # print('raw sentence: ' + s, flush=True)
                tokenized = ' '.join(self.tokenizer(s))
                # print('tokenized sentence: ' + tokenized, flush=True)
                segmented = self.bpe_source.process_line(tokenized)
            elif self.sp_processor_source:
                print('raw sentence: ' + s, flush=True)
                segmented = ' '.join(self.sp_processor_source.EncodeAsPieces(s))
                # print(segmented, flush=True)
            else:
                raise RuntimeError("No tokenization / segmentation method defined, can't preprocess")
            self.sentences.append(segmented)
        return self.sentences

    def postprocess(self, recievedsentences):
        sentTranslated = []
        for index, s in enumerate(recievedsentences):
            received = s.strip().split(' ||| ')
            # print(received, flush=True)

            # undo segmentation
            if self.bpe_source:
                translated = received[0].replace('@@ ', '')
            elif self.sp_processor_target:
                translated = self.sp_processor_target.DecodePieces(received[0].split(' '))
            else:
                translated = received[0].replace(' ', '').replace('▁', ' ').strip()

            alignment = ''
            if len(received) == 2:
                alignment = received[1]
                links = alignment.split(' ')
                fixedLinks = []
                outputLength = len(received[0].split(' '))
                for link in links:
                    ids = link.split('-')
                    if ids[0] != '-1' and int(ids[0]) < len(self.sentences[index]):
                        if int(ids[1]) < outputLength:
                            fixedLinks.append('-'.join(ids))
                alignment = ' '.join(fixedLinks)

            if self.detokenizer:
                detokenized = self.detokenizer(translated.split())
            else:
                detokenized = translated

            sentTranslated.append(detokenized)
        return sentTranslated
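A plausible end-to-end use of ContentProcessor around a translation backend; the language pair, codes file name, and the server call are stand-ins, not part of the original:

# hypothetical wiring: segment source text, translate it elsewhere, then restore
processor = ContentProcessor('de', 'en', sourcebpe='bpe.de-en.codes')
segments = processor.preprocess('Das ist ein Test. Hier ist noch ein Satz.')
raw_output = translate_on_server(segments)  # stand-in for the MT backend call
print(processor.postprocess(raw_output))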
def Eval(sess):
    """Builds and evaluates a network."""
    logging.set_verbosity(logging.INFO)
    bpe = BPE(codecs.open("code-file", encoding='utf-8'), "@@")
    wordMapPath = "word-map"
    tagMapPath = "tag-map"
    pMapPath = "prefix-list"
    sMapPath = "suffix-list"
    pMap = readAffix(pMapPath)
    sMap = readAffix(sMapPath)
    wordMap = readMap(wordMapPath)
    tagMap = readMap(tagMapPath)
    wordMap, _ = bpe.segment(wordMap)
    wordMap = list(set(process_seg_sent(wordMap)))
    wordMap.insert(0, "-start-")
    wordMap.insert(0, "-end-")
    wordMap.insert(0, "-unknown-")
    pMap.insert(0, "-start-")
    pMap.insert(0, "-unknown-")
    sMap.insert(0, "-start-")
    sMap.insert(0, "-unknown-")
    # num of features for each feature group:
    # capitalization, words, other, prefix_2, suffix_2, previous_tags
    feature_sizes = [8, 8, 2, 4]
    domain_sizes = [3, len(wordMap) + 3, 3, len(tagMap) + 1]
    num_actions = 45
    embedding_dims = [8, 64, 8, 16]
    t = time.time()
    hidden_layer_sizes = map(int, FLAGS.hidden_layer_sizes.split(','))
    logging.info('Building training network with parameters: feature_sizes: %s '
                 'domain_sizes: %s', feature_sizes, domain_sizes)
    test_data_path = '/cs/natlang-user/vivian/wsj-conll/test.conllu'
    logging.info("loading data and precomputing features...")
    test_data = ConllData(test_data_path, wordMap, tagMap, pMap, sMap, bpe)
    tagger = GreedyTagger(num_actions, feature_sizes, domain_sizes,
                          embedding_dims, hidden_layer_sizes, gate_gradients=True)
    tagger.AddEvaluation(FLAGS.batch_size)
    tagger.AddSaver()
    sess.run(tagger.inits.values())
    tagger.saver.restore(sess, FLAGS.model_path)

    t = time.time()
    num_epochs = None
    num_tokens = 0
    num_correct = 0
    index = 0
    epochs = 0
    epochs, sent_batch = loadBatch(FLAGS.batch_size, epochs, test_data)
    while True:
        sent_batch, epochs, feature_endpoints, gold_tags, words = get_current_features(
            sent_batch, epochs, test_data, wordMap, tagMap, pMap, sMap)
        tf_eval_metrics = sess.run(tagger.evaluation['logits'],
                                   feed_dict={tagger.test_input: feature_endpoints})
        for i in range(FLAGS.batch_size):
            best_action = 0
            best_score = float("-inf")
            for j in range(45):
                if tf_eval_metrics[i][j] > best_score:
                    best_score = tf_eval_metrics[i][j]
                    best_action = j
            sent_batch[i].set_tag(tagMap[best_action])
        if num_epochs is None:
            num_epochs = epochs
        elif num_epochs < sent_batch[0].get_epoch():
            break

    test_data.reset_index()
    while test_data.has_next_sent():
        sent = test_data.get_next_sent()
        output_tags = sent.get_tag_output()
        gold_tags = sent.origin_tag_list
        word_list, output_tags = combine_seg(sent.seg_word_list, output_tags)
        for idx, tag in enumerate(gold_tags):
            num_tokens += 1
            if tag == output_tags[idx]:
                num_correct += 1
        sent.reset_state()

    eval_metric = 0 if num_tokens == 0 else (100.0 * num_correct / num_tokens)
    logging.info('Number of Tokens: %d, Seconds elapsed in evaluation: %.2f, '
                 'eval metric: %.2f%%', num_tokens, time.time() - t, eval_metric)
    logging.info('num correct tokens: %d', num_correct)
parser.add_argument('-c', '--cache', type=str, default='opusMT-cache.db',
                    help='cache database for translations')
args = parser.parse_args()

if not args.deftrg:
    args.deftrg = args.trglangs[0]

## load BPE model for pre-processing
if args.bpe:
    print("load BPE codes from " + args.bpe, flush=True)
    BPEcodes = codecs.open(args.bpe, encoding='utf-8')
    bpe = BPE(BPEcodes)

## load SentencePiece model for pre-processing
if args.spm:
    print("load sentence piece model from " + args.spm, flush=True)
    spm = spm.SentencePieceProcessor()
    spm.Load(args.spm)

## open the cache DB
print("open cache at " + args.cache, flush=True)
cache = SqliteDict(args.cache, autocommit=True)

## add signal handler for SIGINT to properly close
## the DB when interrupting
def signal_handler(sig, frame):
def main(args):
    setup_logger(args)

    # just to make the code more understandable
    args.interactive = sys.stdin.isatty() and not args.file
    if args.file:
        data_descriptor = open(args.file, 'r')
    else:
        data_descriptor = sys.stdin

    if args.interactive:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1
    if args.buffer_size > 50000:
        print("WARNING: To prevent memory exhaustion buffer size is set to 50000",
              file=sys.stderr)
        args.buffer_size = 50000

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args, file=sys.stderr)

    use_cuda = torch.cuda.is_available() and not args.cpu
    processing_start = time.time()

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path), file=sys.stderr)
    model_paths = args.path.split(':')
    models, model_args, src_dict, tgt_dict = load_ensemble_for_inference(model_paths)
    if args.fp16:
        for model in models:
            model.half()

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(need_attn=args.print_alignment)

    # Initialize generator
    translator = SequenceGenerator(
        models,
        tgt_dict.get_metadata(),
        maxlen=args.max_target_positions,
        beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen,
        unk_penalty=args.unkpen,
        sampling=args.sampling,
        sampling_topk=args.sampling_topk,
        minlen=args.min_len,
        sampling_temperature=args.sampling_temperature)

    if use_cuda:
        translator.cuda()

    # Load BPE codes file
    if args.bpe_codes:
        codes = open(args.bpe_codes, 'r')
        bpe = BPE(codes)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    def make_result(src_str, hypos):
        result = Translation(
            src_str=src_str,
            hypos=[],
            pos_scores=[],
            alignments=[],
        )

        # Process top predictions
        for hypo in hypos[:min(len(hypos), args.nbest)]:
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
                align_dict=align_dict,
                tgt_dict=tgt_dict,
                remove_bpe=args.remove_bpe,
            )
            hypo_str = tokenizer.Tokenizer.detokenize(hypo_str, 'de').strip()
            result.hypos.append((hypo['score'], hypo_str))
            result.pos_scores.append('P\t' + ' '.join(
                f'{x:.4f}' for x in hypo['positional_scores'].tolist()))
            result.alignments.append(
                'A\t' + ' '.join(str(utils.item(x)) for x in alignment)
                if args.print_alignment else None)

        return result

    gen_timer = StopwatchMeter()

    def process_batch(batch):
        tokens = batch.tokens
        lengths = batch.lengths

        if use_cuda:
            tokens = tokens.cuda()
            lengths = lengths.cuda()

        translation_start = time.time()
        gen_timer.start()
        translations = translator.generate(
            tokens,
            lengths,
            maxlen=int(args.max_len_a * tokens.size(1) + args.max_len_b),
        )
        gen_timer.stop(sum(len(h[0]['tokens']) for h in translations))
        dllogger.log(step='infer', data={'latency': time.time() - translation_start})

        return [make_result(batch.srcs[i], t) for i, t in enumerate(translations)]

    if args.interactive:
        print('| Type the input sentence and press return:')
    for inputs in buffered_read(args.buffer_size, data_descriptor):
        indices = []
        results = []
        for batch, batch_indices in make_batches(inputs, args, src_dict, args.max_positions, bpe):
            indices.extend(batch_indices)
            results += process_batch(batch)

        for i in np.argsort(indices):
            result = results[i]
            print(result.src_str, file=sys.stderr)
            for hypo, pos_scores, align in zip(result.hypos, result.pos_scores, result.alignments):
                print(f'Score {hypo[0]}', file=sys.stderr)
                print(hypo[1])
                print(pos_scores, file=sys.stderr)
                if align is not None:
                    print(align, file=sys.stderr)

    if args.file:
        data_descriptor.close()

    log_dict = {
        'throughput': 1. / gen_timer.avg,
        'latency_avg': sum(gen_timer.intervals) / len(gen_timer.intervals),
        'latency_p90': gen_timer.p(90),
        'latency_p95': gen_timer.p(95),
        'latency_p99': gen_timer.p(99),
        'total_inference_time': gen_timer.sum,
        'total_run_time': time.time() - processing_start,
    }
    print('Translation time: {} s'.format(log_dict['total_inference_time']), file=sys.stderr)
    print('Model throughput (beam {}): {} tokens/s'.format(args.beam, log_dict['throughput']),
          file=sys.stderr)
    print('Latency:\n\tAverage {:.3f}s\n\tp90 {:.3f}s\n\tp95 {:.3f}s\n\tp99 {:.3f}s'.format(
        log_dict['latency_avg'], log_dict['latency_p90'],
        log_dict['latency_p95'], log_dict['latency_p99']), file=sys.stderr)
    print('End to end time: {} s'.format(log_dict['total_run_time']), file=sys.stderr)
    dllogger.log(step=(), data=log_dict)
def main(models, saveto, bpe_file, save_alignment=None, k=5,
         normalize=False, n_process=5, chr_level=False, verbose=False,
         nbest=False, suppress_unk=False, a_json=False,
         print_word_probabilities=False, return_hyp_graph=False):
    # load model options
    options = []
    for model in models:
        options.append(load_config(model))
        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    # CAN I MAKE IT INTO A SERVER?

    ###### The following functions should already be part of serverisation

    # utility function
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f, processes, queue):
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()
            x = []
            for w in words:
                w = [word_dicts[i][f] if f in word_dicts[i] else 1
                     for (i, f) in enumerate(w.split('|'))]
                if len(w) != options[0]['factors']:
                    sys.stderr.write(
                        'Error: expected {0} factors, but input word has {1}\n'.format(
                            options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)
            x += [[0] * options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        return idx + 1, source_sentences

    def _finish_processes(queue):
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples, processes, queue, rqueue):
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = None
            while resp is None:
                try:
                    # if the queue is empty after 5s, check if the processes are still alive
                    resp = rqueue.get(True, 5)
                except Empty:
                    for midx in xrange(n_process):
                        if not processes[midx].is_alive():
                            # kill all other processes and raise exception if one dies
                            queue.cancel_join_thread()
                            rqueue.cancel_join_thread()
                            for idx in xrange(n_process):
                                processes[idx].terminate()
                            sys.stderr.write(
                                "Error: translate worker process {0} crashed with exitcode {1}".format(
                                    processes[midx].pid, processes[midx].exitcode))
                            sys.exit(1)
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx + 1), n_samples))
            while out_idx < n_samples and trans[out_idx] != None:
                yield trans[out_idx]
                out_idx += 1

    def _parallelized_main(fs_init, fs_next, c, bpe, tokenizer, detokenizer):
        source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        while source_file_t[0] != "EOT":
            for i in range(len(source_file_t)):
                # print source_file_t[i].decode('utf-8')
                # pipe = subprocess.Popen("echo " + source_file_t[i] + "| perl truecase.perl --model en-truecase.mdl", shell=True)
                # pipe = subprocess.Popen(["echo", '"' + source_file_t[i] + '"', "|", "perl", "truecase.perl",
                #                          "--model", "en-truecase.mdl"], stdout=subprocess.PIPE)
                # result = pipe.stdout.read()
                # source_file_t[i] = subprocess.check_output()
                source_file_t[i] = bpe.segment(
                    tokenizer.tokenize(source_file_t[i], return_str=True)).strip()
            print source_file_t
            detokenized = ''
            queue = Queue()
            rqueue = Queue()
            processes = [None] * n_process
            for midx in xrange(n_process):
                processes[midx] = Process(
                    target=translate_model,
                    args=(queue, rqueue, midx, models, options, k, normalize,
                          verbose, nbest, save_alignment is not None,
                          suppress_unk, return_hyp_graph, fs_init, fs_next))
                processes[midx].start()

            n_samples, source_sentences = _send_jobs(source_file_t, processes, queue)
            _finish_processes(queue)

            # the model loading takes place in the head of the for loop, probably in _retrieve_jobs
            for i, trans in enumerate(_retrieve_jobs(n_samples, processes, queue, rqueue)):
                print "NEXT SENTENCE:"
                if nbest:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
                    order = numpy.argsort(scores)
                    for j in order:
                        if print_word_probabilities:
                            probs = " ||| " + " ".join("{0}".format(prob) for prob in word_probs[j])
                        else:
                            probs = ""
                        saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(
                            i, _seqs2words(samples[j]), scores[j], probs))
                        # print alignment matrix for each hypothesis
                        # header: sentence id ||| translation ||| score ||| source |||
                        #         source_token_count+eos translation_token_count+eos
                        if save_alignment is not None:
                            if a_json:
                                print_matrix_json(alignment[j], source_sentences[i],
                                                  _seqs2words(samples[j]).split(),
                                                  i, i + j, save_alignment)
                            else:
                                save_alignment.write(
                                    '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                        i, _seqs2words(samples[j]), scores[j],
                                        ' '.join(source_sentences[i]),
                                        len(source_sentences[i]) + 1, len(samples[j])))
                                print_matrix(alignment[j], save_alignment)
                else:
                    samples, scores, word_probs, alignment, hyp_graph = trans
                    if return_hyp_graph:
                        renderer = HypGraphRenderer(hyp_graph)
                        renderer.wordify(word_idict_trg)
                        renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
                    ## TODO: Handle the output here
                    # print((_seqs2words(samples) + "\n").encode('utf-8'))
                    # text.append(_seqs2words(samples) + "\n")
                    x = _seqs2words(samples)
                    detokenized += detokenizer.detokenize(
                        (x.decode('utf-8') + " ").split(), return_str=True)
                    detokenized = detokenized[0].upper() + detokenized[1:]
                    ## TODO: End of output handling
                    if print_word_probabilities:
                        for prob in word_probs:
                            saveto.write("{} ".format(prob))
                        saveto.write('\n')
                    if save_alignment is not None:
                        if a_json:
                            print_matrix_json(alignment, source_sentences[i],
                                              _seqs2words(trans[0]).split(),
                                              i, i, save_alignment)
                        else:
                            save_alignment.write(
                                '{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                    i, _seqs2words(trans[0]), 0,
                                    ' '.join(source_sentences[i]),
                                    len(source_sentences[i]) + 1, len(trans[0])))
                            print_matrix(alignment, save_alignment)
            c.send(detokenized.replace('@@ ', '').encode('utf-8').strip())
            source_file_t = sent_tokenize(c.recv(4096).decode('utf-8'))
        c.close()
        sys.stderr.write('Done\n')

    def _listen(c, addr, fs_init, fs_next, tokenizer, detokenizer, bpe):
        while True:
            try:
                # Establish connection with client.
                try:
                    print 'Got connection from', addr
                    print "Receiving..."
                    fname = c.recv(4096)
                except socket.error:
                    c.close()
                    print "connection closed"
                    break
                print fname
                c.send("okay")
                # if fname == 'exit':
                #     print "Terminating connection with client."
                #     c.close()
                #     break
                try:
                    t = threading.Thread(target=_parallelized_main,
                                         args=(fs_init, fs_next, c, bpe, tokenizer, detokenizer))
                    t.start()
                    t.join()
                except socket.error:
                    c.close()
                    break
            except KeyboardInterrupt as e:
                LOG.debug('Ctrl+C issued ...')
                LOG.info('Terminating server ...')
                try:
                    c.shutdown(socket.SHUT_RDWR)
                    c.close()
                except:
                    pass
                break

    s = socket.socket()          # Create a socket object
    host = socket.gethostname()  # Get local machine name
    port = 12345                 # Reserve a port for your service.
    s.bind((host, port))         # Bind to the port
    # Now wait for client connection.

    # Beginning model loading
    from theano_util import (load_params, init_theano_params)
    from nmt import (build_sampler)
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    from theano import shared

    trng = RandomStreams(1234)
    use_noise = shared(numpy.float32(0.))

    fs_init = []
    fs_next = []
    for model, option in zip(models, options):
        # load model parameters and set theano shared variables
        param_list = numpy.load(model).files
        param_list = dict.fromkeys(
            [key for key in param_list if not key.startswith('adam_')], 0)
        params = load_params(model, param_list)
        tparams = init_theano_params(params)

        # word index
        f_init, f_next = build_sampler(tparams, option, use_noise, trng,
                                       return_alignment=save_alignment is not None)
        fs_init.append(f_init)
        fs_next.append(f_next)
    # end of model loading

    tokenizer = moses.MosesTokenizer()
    detokenizer = moses.MosesDetokenizer()

    # start listening to connections once models are loaded
    args.codes = codecs.open(bpe_file[0], encoding='utf-8')
    bpe = BPE(args.codes, '@@')

    while True:
        try:
            s.listen(5)
            print("Waiting for connections...")
            c, addr = s.accept()
            t = threading.Thread(target=_listen,
                                 args=(c, addr, fs_init, fs_next, tokenizer, detokenizer, bpe))
            t.start()
        except KeyboardInterrupt:
            break
    s.close()
import kenlm
from apply_bpe import BPE
from common_text_features_functions import cut_xml

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Applying LM for the gazette.")
    parser.add_argument('-vw', help="VW file without calculated lm -> gazettetitle.without_lm.vw")
    parser.add_argument('--codes', '-c', type=argparse.FileType('r'),
                        help="File with BPE codes (created by learn_bpe.py).")
    args = parser.parse_args()

    # Define LMs
    pages_lm = kenlm.LanguageModel("LM/necrologies_lm.klm")
    necrologues_lm = kenlm.LanguageModel("LM/necrologies_lm.klm")
    bpe = BPE(args.codes, "@@")

    vw_file = args.vw
    file_name = os.path.basename(vw_file)
    gazette_title = vw_file.replace(file_name, "")

    with open(vw_file) as rectangles_to_check:
        for rectangle in rectangles_to_check.readlines():
            page = re.search(r"PAGE:\d\d?", rectangle).group(0).replace("PAGE:", "")
            x1 = re.search(r"X1:\d{1,4}", rectangle).group(0).replace("X1:", "")
            x2 = re.search(r"X2:\d{1,4}", rectangle).group(0).replace("X2:", "")
            y1 = re.search(r"Y1:\d{1,4}", rectangle).group(0).replace("Y1:", "")
            y2 = re.search(r"Y2:\d{1,4}", rectangle).group(0).replace("Y2:", "")
            xml_coord = gazette_title + "/page_" + page + ".xml_coord"
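The loop is truncated before the LM is applied; presumably each rectangle's text is BPE-segmented and scored, along the lines of this sketch (the text-extraction call is a stand-in, since cut_xml's signature is not shown):

            # hypothetical continuation: score the rectangle's text with the LM
            text = cut_xml(xml_coord, x1, y1, x2, y2)  # stand-in for the real extraction call
            segmented = bpe.process_line(text).strip()
            lm_score = pages_lm.score(segmented, bos=True, eos=True)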
def main(args):
    if args.buffer_size < 1:
        args.buffer_size = 1
    if args.max_tokens is None and args.max_sentences is None:
        args.max_sentences = 1

    assert not args.sampling or args.nbest == args.beam, \
        '--sampling requires --nbest to be equal to --beam'
    assert not args.max_sentences or args.max_sentences <= args.buffer_size, \
        '--max-sentences/--batch-size cannot be larger than --buffer-size'

    print(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(args)

    # Load ensemble
    print('| loading model(s) from {}'.format(args.path))
    model_paths = args.path.split(':')
    models, model_args = utils.load_ensemble_for_inference(
        model_paths, task, model_arg_overrides=eval(args.model_overrides))

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()

    # Initialize generator
    translator = SequenceGenerator(
        models, tgt_dict, beam_size=args.beam,
        stop_early=(not args.no_early_stop),
        normalize_scores=(not args.unnormalized),
        len_penalty=args.lenpen, unk_penalty=args.unkpen,
        sampling=args.sampling, sampling_topk=args.sampling_topk,
        minlen=args.min_len, sampling_temperature=args.sampling_temperature)

    if use_cuda:
        translator.cuda()

    # Load BPE codes file
    if args.bpe_codes:
        codes = open(args.bpe_codes, 'r')
        bpe = BPE(codes)

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    def make_result(src_str, hypos):
        result = Translation(
            src_str='O\t{}'.format(src_str),
            hypos=[],
            pos_scores=[],
            alignments=[],
        )

        # Process top predictions
        for hypo in hypos[:min(len(hypos), args.nbest)]:
            hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                hypo_tokens=hypo['tokens'].int().cpu(),
                src_str=src_str,
                alignment=hypo['alignment'].int().cpu() if hypo['alignment'] is not None else None,
                align_dict=align_dict,
                tgt_dict=tgt_dict,
                remove_bpe=args.remove_bpe,
            )
            hypo_str = tokenizer.Tokenizer.detokenize(hypo_str, 'de')
            result.hypos.append('H\t{}\t{}'.format(hypo['score'], hypo_str))
            result.pos_scores.append('P\t{}'.format(' '.join(
                map(lambda x: '{:.4f}'.format(x), hypo['positional_scores'].tolist()))))
            result.alignments.append(
                'A\t{}'.format(' '.join(map(lambda x: str(utils.item(x)), alignment)))
                if args.print_alignment else None)

        return result

    gen_timer = StopwatchMeter()
    end2end_timer = StopwatchMeter()

    def process_batch(batch):
        tokens = batch.tokens
        lengths = batch.lengths

        if use_cuda:
            tokens = tokens.cuda()
            lengths = lengths.cuda()

        gen_timer.start()
        translations = translator.generate(
            tokens,
            lengths,
            maxlen=int(args.max_len_a * tokens.size(1) + args.max_len_b),
        )
        gen_timer.stop()

        return [make_result(batch.srcs[i], t) for i, t in enumerate(translations)]

    if args.buffer_size > 1:
        print('| Sentence buffer size:', args.buffer_size)
    print('| Type the input sentence and press return:')
    for inputs in buffered_read(args.buffer_size):
        indices = []
        results = []
        end2end_timer.start()
        for batch, batch_indices in make_batches(inputs, args, src_dict,
                                                 models[0].max_positions(), bpe):
            indices.extend(batch_indices)
            results += process_batch(batch)

        for i in np.argsort(indices):
            result = results[i]
            print(result.src_str)
            for hypo, pos_scores, align in zip(result.hypos, result.pos_scores, result.alignments):
                print(hypo)
                print(pos_scores)
                if align is not None:
                    print(align)
        print('Model latency: {} s'.format(gen_timer.sum))
        gen_timer.reset()
        end2end_timer.stop()
        print('End-to-end translation time: {} s'.format(end2end_timer.sum))
        end2end_timer.reset()