import codecs
import os
import unittest

from subword_nmt.apply_bpe import BPE

# directory containing this test file and its data/ folder
currentdir = os.path.dirname(os.path.abspath(__file__))


class TestBPESegmentMethod(unittest.TestCase):

    def setUp(self):
        with codecs.open(os.path.join(currentdir, 'data', 'bpe.ref'), encoding='utf-8') as bpefile:
            self.bpe = BPE(bpefile)
        self.infile = codecs.open(os.path.join(currentdir, 'data', 'corpus.en'), encoding='utf-8')
        self.reffile = codecs.open(os.path.join(currentdir, 'data', 'corpus.bpe.ref.en'), encoding='utf-8')

    def tearDown(self):
        self.infile.close()
        self.reffile.close()

    def test_apply_bpe(self):
        for line, ref in zip(self.infile, self.reffile):
            out = self.bpe.process_line(line)
            self.assertEqual(out, ref)

    def test_trailing_whitespace(self):
        """BPE.process_line() preserves leading and trailing whitespace."""
        orig = ' iron cement \n'
        exp = ' ir@@ on c@@ ement \n'
        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_utf8_whitespace(self):
        """UTF-8 whitespace is treated as a normal character, not a word boundary."""
        orig = 'iron\xa0cement\n'
        exp = 'ir@@ on@@ \xa0@@ c@@ ement\n'
        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_empty_line(self):
        orig = '\n'
        exp = '\n'
        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)
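# Minimal usage sketch (not part of the test suite above): the codes file path
# and the sample line are assumptions, but BPE(codefile) + process_line() is
# exactly the API the tests exercise.
import codecs
from subword_nmt.apply_bpe import BPE

with codecs.open('data/bpe.ref', encoding='utf-8') as codefile:
    bpe = BPE(codefile)
print(bpe.process_line('iron cement\n'))  # e.g. 'ir@@ on c@@ ement\n'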
import codecs

from subword_nmt.apply_bpe import BPE


class SplitWord:
    """Splits a word into subword units with BPE, character n-grams, or not at all."""

    def __init__(self, config):
        if "BPE" in config:
            if "BPE" in config["BPE"]:  # value is one of the BPE variants
                self.way = config["BPE"]
                if config["BPE"] == "BPE":
                    self.bpe = BPE(codecs.open('D:/wiki_20180801/bpe.code', encoding='utf-8'), separator='')
                elif config["BPE"] == "BPE1000":
                    self.bpe = BPE(codecs.open('D:/wiki_20180801/bpe1000.code', encoding='utf-8'), separator='')
                else:
                    print("BPE definition error")
                    exit()
            else:  # e.g. "Ngram"
                self.way = config["BPE"]
        else:
            self.way = "Normal"

    def __call__(self, word):
        if self.way in ("BPE", "BPE1000"):  # both BPE variants segment with the loaded code file
            return self.bpe.process_line(word).split(" ")
        elif self.way == "Ngram":
            # collect character n-grams of lengths 3..6 (ngram() is assumed to be defined elsewhere)
            list_of_ngram = []
            for i in range(3, 7):
                list_of_ngram.extend(ngram(word, i))
            return list_of_ngram
        else:
            return word
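# Hypothetical driver for SplitWord above; the config values mirror the keys
# __init__ inspects, and the outputs shown are illustrative only.
splitter = SplitWord({"BPE": "BPE"})            # loads D:/wiki_20180801/bpe.code
print(splitter("international"))                # e.g. ['intern', 'ational']
print(SplitWord({"BPE": "Ngram"})("word"))      # character 3- to 6-grams
print(SplitWord({})("word"))                    # "Normal": the word, unchanged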
import codecs

from subword_nmt.apply_bpe import BPE


class BPEService:
    """Applies a BPE model to UTF-8 encoded byte strings."""

    def __init__(self, codes):
        self.bpe = BPE(codecs.open(codes, encoding='utf-8'))

    def process_line(self, line):
        # decode the incoming bytes, segment, and re-encode the result
        return self.bpe.process_line(line.decode("UTF-8")).encode("UTF-8")
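# Sketch of BPEService in use; 'codes.bpe' is a placeholder path. Note that
# this process_line() wrapper expects bytes and returns bytes, matching the
# decode/encode round trip above.
service = BPEService('codes.bpe')
print(service.process_line(b'iron cement\n'))  # e.g. b'ir@@ on c@@ ement\n'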
import argparse
import codecs

# learn_bpe, BPE, read_vocabulary, get_vocab and learn_joint_bpe_and_vocab come
# from the subword_nmt package; the create_*_parser helpers are defined
# alongside main() in subword_nmt/subword_nmt.py.
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE, read_vocabulary
from subword_nmt.get_vocab import get_vocab
from subword_nmt.learn_joint_bpe_and_vocab import learn_joint_bpe_and_vocab


def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="subword-nmt: unsupervised word segmentation for neural machine translation and text generation")
    subparsers = parser.add_subparsers(dest='command',
        help="""command to run. Run one of the commands with '-h' for more info.

learn-bpe: learn BPE merge operations on input text.
apply-bpe: apply given BPE operations to input text.
get-vocab: extract vocabulary and word frequencies from input text.
learn-joint-bpe-and-vocab: executes recommended workflow for joint BPE.""")

    learn_bpe_parser = create_learn_bpe_parser(subparsers)
    apply_bpe_parser = create_apply_bpe_parser(subparsers)
    get_vocab_parser = create_get_vocab_parser(subparsers)
    learn_joint_bpe_and_vocab_parser = create_learn_joint_bpe_and_vocab_parser(subparsers)

    args = parser.parse_args()

    if args.command == 'learn-bpe':
        # read/write files as UTF-8
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input)
    elif args.command == 'apply-bpe':
        # read/write files as UTF-8
        args.codes = codecs.open(args.codes.name, encoding='utf-8')
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        if args.vocabulary:
            args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8')

        if args.vocabulary:
            vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold)
        else:
            vocabulary = None

        bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)
        for line in args.input:
            args.output.write(bpe.process_line(line))
    elif args.command == 'get-vocab':
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        get_vocab(args.input, args.output)
    elif args.command == 'learn-joint-bpe-and-vocab':
        learn_joint_bpe_and_vocab(args)
    else:
        raise Exception('Invalid command provided')
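# The apply-bpe branch of main(), driven programmatically instead of via the
# CLI; all three file names are placeholders.
import codecs
from subword_nmt.apply_bpe import BPE

with codecs.open('codes.bpe', encoding='utf-8') as codes, \
     codecs.open('corpus.txt', encoding='utf-8') as infile, \
     codecs.open('corpus.bpe', 'w', encoding='utf-8') as outfile:
    bpe = BPE(codes)
    for line in infile:
        outfile.write(bpe.process_line(line))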
import sentencepiece
from mosestokenizer import (MosesDetokenizer, MosesPunctuationNormalizer,
                            MosesSentenceSplitter, MosesTokenizer)

from subword_nmt.apply_bpe import BPE


class ContentProcessor:
    def __init__(self, srclang, targetlang, sourcebpe=None, targetbpe=None,
                 sourcespm=None, targetspm=None):
        self.bpe_source = None
        self.bpe_target = None
        self.sp_processor_source = None
        self.sp_processor_target = None
        self.sentences = []

        # load BPE model for pre-processing
        if sourcebpe:
            # print("load BPE codes from " + sourcebpe, flush=True)
            BPEcodes = open(sourcebpe, 'r', encoding="utf-8")
            self.bpe_source = BPE(BPEcodes)
        if targetbpe:
            # print("load BPE codes from " + targetbpe, flush=True)
            BPEcodes = open(targetbpe, 'r', encoding="utf-8")
            self.bpe_target = BPE(BPEcodes)

        # load SentencePiece model for pre-processing
        if sourcespm:
            # print("load sentence piece model from " + sourcespm, flush=True)
            self.sp_processor_source = sentencepiece.SentencePieceProcessor()
            self.sp_processor_source.Load(sourcespm)
        if targetspm:
            # print("load sentence piece model from " + targetspm, flush=True)
            self.sp_processor_target = sentencepiece.SentencePieceProcessor()
            self.sp_processor_target.Load(targetspm)

        # pre- and post-processing tools
        self.tokenizer = None
        self.detokenizer = None

        # TODO: should we have support for other sentence splitters?
        # print("start pre- and post-processing tools")
        self.sentence_splitter = MosesSentenceSplitter(srclang)
        self.normalizer = MosesPunctuationNormalizer(srclang)
        if self.bpe_source:
            self.tokenizer = MosesTokenizer(srclang)
            self.detokenizer = MosesDetokenizer(targetlang)

    def preprocess(self, srctxt):
        # the normalizer does not accept '\n', so normalize line by line
        normalized_text = '\n'.join(self.normalizer(line) for line in srctxt.split('\n'))
        sentSource = self.sentence_splitter([normalized_text])
        self.sentences = []
        for s in sentSource:
            if self.tokenizer:
                # print('raw sentence: ' + s, flush=True)
                tokenized = ' '.join(self.tokenizer(s))
                # print('tokenized sentence: ' + tokenized, flush=True)
                segmented = self.bpe_source.process_line(tokenized)
            elif self.sp_processor_source:
                # print('raw sentence: ' + s, flush=True)
                segmented = ' '.join(self.sp_processor_source.EncodeAsPieces(s))
                # print(segmented, flush=True)
            else:
                raise RuntimeError("No tokenization / segmentation method defined, can't preprocess")
            self.sentences.append(segmented)
        return self.sentences

    def postprocess(self, receivedsentences):
        sentTranslated = []
        for index, s in enumerate(receivedsentences):
            received = s.strip().split(' ||| ')
            # print(received, flush=True)

            # undo segmentation
            if self.bpe_source:
                translated = received[0].replace('@@ ', '')
            elif self.sp_processor_target:
                translated = self.sp_processor_target.DecodePieces(received[0].split(' '))
            else:
                translated = received[0].replace(' ', '').replace('▁', ' ').strip()

            # keep only alignment links whose indices fall inside both sentences
            alignment = ''
            if len(received) == 2:
                alignment = received[1]
                links = alignment.split(' ')
                fixedLinks = []
                outputLength = len(received[0].split(' '))
                for link in links:
                    ids = link.split('-')
                    if ids[0] != '-1' and int(ids[0]) < len(self.sentences[index]):
                        if int(ids[1]) < outputLength:
                            fixedLinks.append('-'.join(ids))
                alignment = ' '.join(fixedLinks)

            if self.detokenizer:
                detokenized = self.detokenizer(translated.split())
            else:
                detokenized = translated

            sentTranslated.append(detokenized)
        return sentTranslated
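# Hedged example of driving ContentProcessor; the language codes and BPE file
# names are assumptions. preprocess() returns BPE-segmented sentences for the
# translation backend, and postprocess() undoes segmentation and detokenizes.
proc = ContentProcessor('en', 'de', sourcebpe='source.bpe', targetbpe='target.bpe')
segments = proc.preprocess('This is a test. And another sentence.')
restored = proc.postprocess(['Th@@ is is a test .'])  # illustrative model output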
import codecs
from pathlib import Path

import torch
from tqdm import tqdm

from subword_nmt.apply_bpe import BPE

# patch_src, patch_trg and cal_performance come from the surrounding training script


def train_epoch(model, training_data, optimizer, opt, device, smoothing):
    '''Epoch operation in training phase.'''

    model.train()
    total_loss, n_word_total, n_word_correct = 0, 0, 0

    desc = ' - (Training) '
    cnt = 0
    for batch in tqdm(training_data, mininterval=2, desc=desc, leave=False):
        if cnt > 0:  # debugging shortcut kept from the original: only one batch per epoch
            break

        # prepare data
        src_seq = patch_src(batch.src, opt.src_pad_idx).to(device)
        trg_seq, gold = map(lambda x: x.to(device), patch_trg(batch.trg, opt.trg_pad_idx))
        tgt_seq = trg_seq

        # forward
        optimizer.zero_grad()
        # atten_list: attention weights of shape (6 layers, batch_size * 8 heads, tgt, src);
        # pred is already flattened to make the loss computation below convenient
        pred, atten_list = model(src_seq, tgt_seq)

        # 2020-07-09, 21:52 -- the core algorithm follows.
        # Read the source:target word pairs from the vocab_pair file next to this script.
        output_filedir = Path(__file__).resolve().parent / 'vocab_pair'
        check_dic = {}
        with open(output_filedir, encoding='utf-8') as f:
            for line in f:
                src_word, tgt_word = line.strip('\n').split(':')
                check_dic[src_word] = tgt_word

        # 2020-07-26, 13:10: build token -> index maps for the source and target
        # wordpiece vocabularies
        config_src = {tok: idx for idx, tok in enumerate(training_data.dataset.fields['src'].vocab.itos)}
        config_tgt = {tok: idx for idx, tok in enumerate(training_data.dataset.fields['trg'].vocab.itos)}

        # load the BPE model once, not once per vocabulary pair
        with codecs.open(opt.codes, encoding='utf-8') as codes:
            bpe = BPE(codes, separator=opt.separator)

        # for each aligned word pair, look up its subword ids and collect the
        # corresponding entries of the attention tables
        atten_out = []
        for src_word, tgt_word in check_dic.items():
            # encode both sides with BPE, then map subwords to vocabulary ids
            src_pieces = bpe.process_line(src_word).split(' ')
            tgt_pieces = bpe.process_line(tgt_word).split(' ')
            try:
                left = [config_src[p] for p in src_pieces]
                right = [config_tgt[p] for p in tgt_pieces]
            except KeyError:
                continue  # skip pairs containing out-of-vocabulary subwords

            # whenever the subword id sequences occur in a sentence pair of the
            # batch, accumulate the attention weights between all their positions
            # (a KMP-style search could speed this up later)
            for i2, (a, b) in enumerate(zip(src_seq, tgt_seq)):
                find_left_index = [i for i in range(len(a)) if a[i:i + len(left)].tolist() == left]
                find_right_index = [i for i in range(len(b)) if b[i:i + len(right)].tolist() == right]
                alldexleft = [i + j for i in find_left_index for j in range(len(left))]
                alldexright = [i + j for i in find_right_index for j in range(len(right))]
                for left2 in alldexleft:
                    for right2 in alldexright:
                        atten_out.append(atten_list[i2, :, left2, right2])

        # squared penalty pulling the collected attention weights toward 0.9
        if atten_out:
            atten_flat = torch.flatten(torch.stack(atten_out))
            summy1 = torch.mean((atten_flat - 0.9) ** 2) * opt.alpha
        else:
            summy1 = 0

        loss, n_correct, n_word = cal_performance(pred, gold, opt.trg_pad_idx, smoothing=smoothing)
        loss = loss + summy1
        loss.backward()
        optimizer.step_and_update_lr()

        # note keeping
        n_word_total += n_word
        n_word_correct += n_correct
        total_loss += loss.item()
        cnt += 1

    loss_per_word = total_loss / n_word_total
    accuracy = n_word_correct / n_word_total
    return loss_per_word, accuracy
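# The core idea of the penalty above, distilled: gather the attention weights
# for aligned subword pairs and pull them toward 0.9 with a mean squared
# penalty. `weights` stands in for the gathered values; names are illustrative.
import torch

weights = torch.rand(10, requires_grad=True)   # gathered attention weights
penalty = ((weights - 0.9) ** 2).mean()        # scale by opt.alpha before adding to the loss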