Example #1
class TestBPESegmentMethod(unittest.TestCase):
    def setUp(self):

        with codecs.open(os.path.join(currentdir, 'data', 'bpe.ref'),
                         encoding='utf-8') as bpefile:
            self.bpe = BPE(bpefile)

        self.infile = codecs.open(os.path.join(currentdir, 'data',
                                               'corpus.en'),
                                  encoding='utf-8')
        self.reffile = codecs.open(os.path.join(currentdir, 'data',
                                                'corpus.bpe.ref.en'),
                                   encoding='utf-8')

    def tearDown(self):

        self.infile.close()
        self.reffile.close()

    def test_apply_bpe(self):

        for line, ref in zip(self.infile, self.reffile):
            out = self.bpe.process_line(line)
            self.assertEqual(out, ref)

    def test_trailing_whitespace(self):
        """BPE.proces_line() preserves leading and trailing whitespace"""

        orig = '  iron cement  \n'
        exp = '  ir@@ on c@@ ement  \n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_utf8_whitespace(self):
        """UTF-8 whitespace is treated as normal character, not word boundary"""

        orig = 'iron\xa0cement\n'
        exp = 'ir@@ on@@ \xa0@@ c@@ ement\n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_empty_line(self):

        orig = '\n'
        exp = '\n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)
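
A minimal standalone sketch of the pattern the tests above exercise; the codes file name is a placeholder and the import path assumes the installed subword-nmt package:

import codecs
from subword_nmt.apply_bpe import BPE

with codecs.open('bpe.codes', encoding='utf-8') as bpefile:
    bpe = BPE(bpefile)

# process_line keeps leading/trailing whitespace and the newline intact
print(bpe.process_line('  iron cement  \n'))   # with the test codes above: '  ir@@ on c@@ ement  \n'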
Example #2
class SplitWord():
    def __init__(self, config):
        if "BPE" in config:
            if "BPE" in config["BPE"]:
                self.way = config["BPE"]
                if config["BPE"] == "BPE":
                    self.bpe = BPE(codecs.open('D:/wiki_20180801/bpe.code',
                                               encoding='utf-8'),
                                   separator='')
                elif config["BPE"] == "BPE1000":
                    self.bpe = BPE(codecs.open('D:/wiki_20180801/bpe1000.code',
                                               encoding='utf-8'),
                                   separator='')
                else:
                    print("BPE define error")
                    exit()
            else:
                self.way = config["BPE"]
        else:
            self.way = "Normal"

    def __call__(self, word):
        if self.way == "BPE":
            return self.bpe.process_line(word).split(" ")
        elif self.way == "Ngram":
            list_of_ngram = []
            for i in range(3, 7):
                list_of_ngram.extend(ngram(word, i))
            return list_of_ngram
        else:
            return word
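
A hypothetical call site for the class above, assuming the hard-coded BPE code files exist and the `ngram` helper used by the class is importable:

splitter = SplitWord({"BPE": "Ngram"})      # no BPE codes needed for the n-gram path
print(splitter("segmentation"))             # character n-grams of lengths 3-6

bpe_splitter = SplitWord({"BPE": "BPE"})    # loads D:/wiki_20180801/bpe.code
print(bpe_splitter("segmentation"))         # list of BPE pieces (separator='')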
Example #3
class BPEService(object):

    def __init__(self, codes):
        self.bpe = BPE(codecs.open(codes, encoding='utf-8'))

    def process_line(self, line):
        return self.bpe.process_line(line.decode("UTF-8")).encode("UTF-8")
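
A minimal usage sketch, assuming a local codes file (the path is a placeholder); note that process_line here takes and returns bytes:

service = BPEService('bpe.codes')
segmented = service.process_line(b'iron cement\n')
print(segmented)   # e.g. b'ir@@ on c@@ ement\n' with suitable codes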
Example #4
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description="subword-nmt: unsupervised word segmentation for neural machine translation and text generation ")
    subparsers = parser.add_subparsers(dest='command',
                                       help="""command to run. Run one of the commands with '-h' for more info.

learn-bpe: learn BPE merge operations on input text.
apply-bpe: apply given BPE operations to input text.
get-vocab: extract vocabulary and word frequencies from input text.
learn-joint-bpe-and-vocab: executes recommended workflow for joint BPE.""")

    learn_bpe_parser = create_learn_bpe_parser(subparsers)
    apply_bpe_parser = create_apply_bpe_parser(subparsers)
    get_vocab_parser = create_get_vocab_parser(subparsers)
    learn_joint_bpe_and_vocab_parser = create_learn_joint_bpe_and_vocab_parser(subparsers)

    args = parser.parse_args()

    if args.command == 'learn-bpe':
        # read/write files as UTF-8
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')

        learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input)
    elif args.command == 'apply-bpe':
        # read/write files as UTF-8
        args.codes = codecs.open(args.codes.name, encoding='utf-8')
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        if args.vocabulary:
            args.vocabulary = codecs.open(args.vocabulary.name, encoding='utf-8')

        if args.vocabulary:
            vocabulary = read_vocabulary(args.vocabulary, args.vocabulary_threshold)
        else:
            vocabulary = None

        bpe = BPE(args.codes, args.merges, args.separator, vocabulary, args.glossaries)

        for line in args.input:
            args.output.write(bpe.process_line(line))

    elif args.command == 'get-vocab':
        if args.input.name != '<stdin>':
            args.input = codecs.open(args.input.name, encoding='utf-8')
        if args.output.name != '<stdout>':
            args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
        get_vocab(args.input, args.output)
    elif args.command == 'learn-joint-bpe-and-vocab':
        learn_joint_bpe_and_vocab(args)
    else:
        raise Exception('Invalid command provided')
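
The 'apply-bpe' branch above boils down to the following direct API use; the file names are placeholders, and the import path assumes the installed subword-nmt package:

import codecs
from subword_nmt.apply_bpe import BPE

with codecs.open('codes.bpe', encoding='utf-8') as codes, \
     codecs.open('corpus.txt', encoding='utf-8') as infile, \
     codecs.open('corpus.bpe.txt', 'w', encoding='utf-8') as outfile:
    bpe = BPE(codes)
    for line in infile:
        outfile.write(bpe.process_line(line))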
Example #5
class ContentProcessor():
    def __init__(self, srclang, targetlang,
                 sourcebpe=None, targetbpe=None, sourcespm=None, targetspm=None):
        self.bpe_source = None
        self.bpe_target = None
        self.sp_processor_source = None
        self.sp_processor_target = None
        self.sentences=[]
        # load BPE model for pre-processing
        if sourcebpe:
            # print("load BPE codes from " + sourcebpe, flush=True)
            BPEcodes = open(sourcebpe, 'r', encoding="utf-8")
            self.bpe_source = BPE(BPEcodes)
        if targetbpe:
            # print("load BPE codes from " + targetbpe, flush=True)
            BPEcodes = open(targetbpe, 'r', encoding="utf-8")
            self.bpe_target = BPE(BPEcodes)

        # load SentencePiece model for pre-processing
        if sourcespm:
            # print("load sentence piece model from " + sourcespm, flush=True)
            self.sp_processor_source = sentencepiece.SentencePieceProcessor()
            self.sp_processor_source.Load(sourcespm)
        if targetspm:
            # print("load sentence piece model from " + targetspm, flush=True)
            self.sp_processor_target = sentencepiece.SentencePieceProcessor()
            self.sp_processor_target.Load(targetspm)

        # pre- and post-processing tools
        self.tokenizer = None
        self.detokenizer = None

        # TODO: should we have support for other sentence splitters?
        # print("start pre- and post-processing tools")
        self.sentence_splitter = MosesSentenceSplitter(srclang)
        self.normalizer = MosesPunctuationNormalizer(srclang)
        if self.bpe_source:
            self.tokenizer = MosesTokenizer(srclang)
            self.detokenizer = MosesDetokenizer(targetlang)

    def preprocess(self, srctxt):
        normalized_text = '\n'.join(self.normalizer(line) for line in srctxt.split('\n'))   # the normalizer does not accept '\n'
        sentSource = self.sentence_splitter([normalized_text])
        self.sentences=[]
        for s in sentSource:
            if self.tokenizer:
                # print('raw sentence: ' + s, flush=True)
                tokenized = ' '.join(self.tokenizer(s))
                # print('tokenized sentence: ' + tokenized, flush=True)
                segmented = self.bpe_source.process_line(tokenized)
            elif self.sp_processor_source:
                print('raw sentence: ' + s, flush=True)
                segmented = ' '.join(self.sp_processor_source.EncodeAsPieces(s))
                # print(segmented, flush=True)
            else:
                raise RuntimeError("No tokenization / segmentation method defined, can't preprocess")
            self.sentences.append(segmented)
        return self.sentences

    def postprocess(self, receivedsentences):
        sentTranslated = []
        for index, s in enumerate(receivedsentences):
            received = s.strip().split(' ||| ')
            # print(received, flush=True)

            # undo segmentation
            if self.bpe_source:
                translated = received[0].replace('@@ ','')
            elif self.sp_processor_target:
                translated = self.sp_processor_target.DecodePieces(received[0].split(' '))
            else:
                translated = received[0].replace(' ','').replace('▁',' ').strip()

            alignment = ''
            if len(received) == 2:
                alignment = received[1]
                links = alignment.split(' ')
                fixedLinks = []
                outputLength = len(received[0].split(' '))
                for link in links:
                    ids = link.split('-')
                    if ids[0] != '-1' and int(ids[0])<len(self.sentences[index]):
                        if int(ids[1])<outputLength:
                            fixedLinks.append('-'.join(ids))
                alignment = ' '.join(fixedLinks)

            if self.detokenizer:
                detokenized = self.detokenizer(translated.split())
            else:
                detokenized = translated

            sentTranslated.append(detokenized)
        return sentTranslated
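
A hypothetical round trip through the class above, assuming English-to-German models and local BPE code files (paths are placeholders):

proc = ContentProcessor('en', 'de', sourcebpe='en.codes', targetbpe='de.codes')
batch = proc.preprocess('Iron cement is ready for use. It protects surfaces.')
# ... send `batch` to the translation backend, which returns lines of the
# form 'tokens ||| alignment', then undo the segmentation and detokenize:
# translations = proc.postprocess(received_lines)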
Example #6
File: train.py  Project: yf1291/nlp4
def train_epoch(model, training_data, optimizer, opt, device, smoothing):
    ''' Epoch operation in training phase'''

    model.train()
    total_loss, n_word_total, n_word_correct = 0, 0, 0

    desc = '  - (Training)   '
    cnt = 0
    for batch in tqdm(training_data, mininterval=2, desc=desc, leave=False):

        if cnt > 0:
            break
        # prepare data
        src_seq = patch_src(batch.src, opt.src_pad_idx).to(device)
        trg_seq, gold = map(lambda x: x.to(device),
                            patch_trg(batch.trg, opt.trg_pad_idx))
        tgt_seq = trg_seq

        # forward
        optimizer.zero_grad()

        pred, atten_list = model(
            src_seq, tgt_seq)  # atten_list output is (6, batchsize*8, tgt, src)
        # pred has already been flattened, which just makes the loss computation below easier.
        # backward
        '''
        2020-07-09, 21:52: the code below is my core algorithm.
        '''

        from pathlib import Path
        output_filedir = Path(
            __file__).resolve().parent / 'vocab_pair'  # absolute path to the vocab_pair file next to this script

        dic2 = {}
        with open(output_filedir, encoding='utf-8') as f:
            tmp = f.readlines()
            for i in tmp:
                i = i.strip('\n').split(':')
                dic2[i[0]] = i[1]
        check_dic = dic2
        tmmm = 1

        # 2020-07-26, 13:10: start of the modifications.
        config_src = training_data.dataset.fields['src'].vocab.itos
        config_tgt = training_data.dataset.fields['trg'].vocab.itos
        config_src2 = {}
        for i, j in enumerate(config_src):
            config_src2[j] = i
        config_src = config_src2  # source-language wordpiece vocabulary (token -> id)

        config_tgt2 = {}
        for i, j in enumerate(config_tgt):
            config_tgt2[j] = i
        config_tgt = config_tgt2
        print(1111111111111111)

        # Look these pairs up in the attention tables below, then pull the weights out and sum them.
        # First, get the vocabulary ids that the vocab_pair entries map to.
        atten_out = []

        for i in check_dic:
            left = i
            right = check_dic[i]
            # get the encodings (vocabulary ids) for left and right

            with codecs.open(opt.codes, encoding='utf-8') as codes:
                bpe = BPE(codes, separator=opt.separator)
            tmp = bpe.process_line(left).split(' ')
            tmp2 = bpe.process_line(right).split(' ')
            print(11111111)
            try:
                left = [config_src[i] for i in tmp]
                right = [config_tgt[i] for i in tmp2]
            except KeyError:  # skip the attention term when a piece is outside the vocab
                continue

            # otherwise, compute the attention term
            print(left, right)

            # Add the attention penalty wherever the pair occurs.
            # A KMP-like search could speed up the matching later.  # src_seq, tgt_seq
            for i2, (a, b) in enumerate(zip(src_seq, tgt_seq)):
                find_left_index = [
                    i for i in range(len(a)) if a[i:i + len(left)].tolist() == left
                ]
                find_right_index = [
                    i for i in range(len(b)) if b[i:i + len(right)].tolist() == right
                ]
                alldexleft = []
                alldexright = []
                for i in find_left_index:
                    for j in range(len(left)):
                        alldexleft.append(i + j)
                for i in find_right_index:
                    for j in range(len(right)):
                        alldexright.append(i + j)
                # alldexright = [range(i, i + len(left)) for i in find_right_index]
                print(alldexleft, alldexright)
                for left2 in alldexleft:
                    for right2 in alldexright:

                        atten_out.append(atten_list[i2, :, left2, right2])

        # turn the collected attention weights into a penalty pulling them towards 0.9
        if len(atten_out) == 0:
            summy1 = 0
        else:
            atten_out = torch.flatten(torch.stack(atten_out))
            summy1 = torch.mean((atten_out - 0.9) ** 2) * opt.alpha

        loss, n_correct, n_word = cal_performance(pred,
                                                  gold,
                                                  opt.trg_pad_idx,
                                                  smoothing=smoothing)

        loss = loss + summy1
        loss.backward()
        optimizer.step_and_update_lr()

        # note keeping
        n_word_total += n_word
        n_word_correct += n_correct
        total_loss += loss.item()
        cnt += 1

    loss_per_word = total_loss / n_word_total
    accuracy = n_word_correct / n_word_total
    return loss_per_word, accuracy
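
The extra term folded into the loss above amounts to a mean-squared penalty that pulls the selected attention weights towards 0.9, scaled by opt.alpha. A standalone sketch of that term (names are illustrative, not from the original code):

import torch

def attention_anchor_penalty(selected_weights, target=0.9, alpha=1.0):
    """Mean squared distance of the selected attention weights from `target`,
    scaled by `alpha`; zero when nothing was selected."""
    if selected_weights.numel() == 0:
        return torch.tensor(0.0)
    return alpha * torch.mean((selected_weights - target) ** 2)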