Example #1
def bpe_processing(filenames: list[str]):
    ru_sentences = []
    zh_sentences = []

    for filename in filenames:
        with open(filename, "r") as f:
            for line in f:
                ru, zh = line.split(" ||| ")
                ru_sentences.append(ru)
                zh_sentences.append(zh.replace(" ", ""))

    bpe = {}
    learn_bpe(StringIO("\n".join(ru_sentences)),
              open('bpe_rules.ru', 'w'),
              num_symbols=8000)
    bpe["ru"] = BPE(open('./bpe_rules.ru'))
    learn_bpe(StringIO("\n".join(zh_sentences)),
              open('bpe_rules.zh', 'w'),
              num_symbols=8000)
    bpe["zh"] = BPE(open('./bpe_rules.zh'))

    with open("token_map.txt", "w") as f:
        for ru, zh in zip(ru_sentences, zh_sentences):
            ru_tokens = bpe["ru"].process_line(ru.strip())
            zh_tokens = bpe["zh"].process_line(zh.strip())
            ru_map = ru_token_map(ru_tokens)
            zh_map = zh_token_map(zh_tokens)
            print(*ru_map, "|||", *zh_map, file=f)
            print(ru_tokens + " ||| " + zh_tokens)
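The ru_token_map / zh_token_map helpers above are project-specific and not shown; the underlying learn-then-apply pattern they wrap can be reduced to the following minimal sketch, which only assumes subword-nmt is installed and uses in-memory streams instead of files:

import io
from subword_nmt.learn_bpe import learn_bpe
from subword_nmt.apply_bpe import BPE

sentences = ["the quick brown fox", "the lazy dog", "the quick dog"]

codes = io.StringIO()
# learn merge operations from raw text, one sentence per line
learn_bpe(io.StringIO("\n".join(sentences)), codes, num_symbols=50, min_frequency=1)
codes.seek(0)

bpe = BPE(codes)  # load the learned merges
print(bpe.process_line("the quick brown dog"))  # prints the BPE-segmented line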
Example #2
    def finalize(self, frequencies, num_symbols=30000, minfreq=2):
        """Build the codecs.

        :param frequencies: dictionary of (token: frequency) pairs
        :param num_symbols: Number of BPE symbols. Recommend 30000-40000.
            If <= 0, default 30000 will be used.
        :param minfreq: Minimum frequency of a token before forced BPE
            decomposition. If <= 0 will use subword-nmt default of 2.
        """
        if hasattr(self, 'bpe'):
            # we already finalized the codecs
            return False

        print('Dictionary: saving bpe codecs to {}'.format(self.codecs))

        dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

        if num_symbols <= 0:
            num_symbols = 30000
        if minfreq <= 0:
            minfreq = 2
        with open(self.codecs, 'w') as outstream:
            learn_bpe.learn_bpe(
                dictionary,
                outstream,
                num_symbols=num_symbols,
                min_frequency=minfreq,
                is_dict=True,
            )

        self._load_from_codecs()
        return True
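For reference, a hedged sketch of the is_dict=True input format that finalize() builds above: each "line" handed to learn_bpe is a "<token> <frequency>" string rather than running text (the Counter contents here are made up):

import io
from collections import Counter
from subword_nmt import learn_bpe

freqs = Counter({"low": 5, "lower": 2, "newest": 6, "widest": 3})
dictionary = ("{} {}".format(tok, cnt) for tok, cnt in freqs.items())

codes = io.StringIO()
learn_bpe.learn_bpe(dictionary, codes, num_symbols=50, min_frequency=2, is_dict=True)
print(codes.getvalue())  # merge operations, preceded by a '#version' header line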
Example #3
def subword_gen(infile, outfile, num_symbols):
    infile_stream = codecs.open(infile, encoding='utf-8')
    outfile_stream = codecs.open(outfile, 'w', encoding='utf-8')
    learn_bpe(infile_stream,
              outfile_stream,
              num_symbols,
              is_dict=False,
              total_symbols=True)
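A hedged usage note (paths are placeholders): the helper above simply streams the raw training text and writes the learned merge operations to the codes file.

subword_gen('train.txt', 'codes.bpe', 10000)  # hypothetical input/output paths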
Example #4
def build_vocab(imgs, params):
    # count up the number of words
    captions = []
    for img in imgs:
        for sent in img['sentences']:
            captions.append(' '.join(sent['tokens']))
    captions = '\n'.join(captions)
    all_captions = tempfile.NamedTemporaryFile(delete=False)
    all_captions.close()
    with open(all_captions.name, 'w') as txt_file:
        txt_file.write(captions)

    # learn BPE codes from the concatenated captions
    codecs_output = tempfile.NamedTemporaryFile(delete=False)
    codecs_output.close()
    with codecs.open(codecs_output.name, 'w', encoding='UTF-8') as output:
        learn_bpe.learn_bpe(codecs.open(all_captions.name, encoding='UTF-8'),
                            output, params['symbol_count'])

    with codecs.open(codecs_output.name, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes)

    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()

    tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

    for _, img in enumerate(imgs):
        img['final_captions'] = []
        for sent in img['sentences']:
            txt = ' '.join(sent['tokens'])
            txt = bpe.segment(txt).strip()
            img['final_captions'].append(txt.split(' '))
            tmpout.write(txt)
            tmpout.write('\n')
            if _ < 20:
                print(txt)

    tmpout.close()
    tmpin = codecs.open(tmp.name, encoding='UTF-8')

    vocab = learn_bpe.get_vocabulary(tmpin)
    vocab = sorted(vocab.keys(), key=lambda x: vocab[x], reverse=True)

    # Always insert UNK
    print('inserting the special UNK token')
    vocab.append('UNK')

    print('Vocab size:', len(vocab))

    os.remove(all_captions.name)
    with open(codecs_output.name, 'r') as codes:
        bpe = codes.read()
    os.remove(codecs_output.name)
    os.remove(tmp.name)

    return vocab, bpe
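The word counting above relies on learn_bpe.get_vocabulary, which accepts any iterable of whitespace-tokenized lines and returns a word-to-frequency mapping; a small self-contained sketch:

from subword_nmt import learn_bpe

lines = ["a man rides a horse", "a man walks a dog"]
word_counts = learn_bpe.get_vocabulary(lines)
print(sorted(word_counts.items(), key=lambda kv: kv[1], reverse=True))  # [('a', 4), ('man', 2), ...]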
Example #5
    def __learn(self):
        """
        Train a BPE.
        :trainfile: a file path which the model will learn.
        :codesfile: the output codes file.
        :num_symbols: number of vocabulary.
        :min_frequency: min frequency of the word.
        """
        trainfile = codecs.open(self.trainfile, encoding='utf-8')
        codesfile = codecs.open(self.codesfile, mode='w', encoding='utf-8')
        learn_bpe(trainfile, codesfile, self.num_symbols, self.min_frequency)

        self.__open_bpe()
Example #6
def learn_bpe_function(raw_train_file, bpe_codes_file):
    parser = learn_bpe.create_parser()
    args = parser.parse_args(
        ["--input", raw_train_file, "--output", bpe_codes_file])
    args.output = codecs.open(args.output.name, 'w', encoding='utf-8')
    args.input = codecs.open(args.input.name, encoding='utf-8')
    learn_bpe.learn_bpe(args.input,
                        args.output,
                        args.symbols,
                        args.min_frequency,
                        args.verbose,
                        is_dict=args.dict_input,
                        total_symbols=args.total_symbols)
Example #7
def build_wikitext_bpe_encoder(
        special_tokens: Optional[Sequence[str]] = None) -> BPEEncoder:

    regex_tokenizer = RegexTokenizer()

    def tokenize_with_regex(text: str) -> Iterable[str]:
        document = regex_tokenizer.apply(text)
        for token in document:
            if token.number:
                yield TOKEN_FOR_NUMBERS
            else:
                yield str(token)

    def wikitext_tokens(tokenizer: Callable[[str], Iterable[str]],
                        description: str):
        train_tokens = read_wikitext_file(TRAINING_SET_NAME)
        all_lines = train_tokens.splitlines()
        for line in tqdm.tqdm(all_lines, desc=description):
            yield from tokenizer(line)

    vocabulary_file = io.StringIO('\n'.join(
        '{} {}'.format(word, counter) for word, counter in build_vocabulary(
            wikitext_tokens(tokenize_with_regex, 'Building vocabulary'))))

    with io.StringIO() as file_with_merges:
        print('Learning BPE...', flush=True, end='')
        learn_bpe(vocabulary_file,
                  file_with_merges,
                  NUM_BPE_MERGES,
                  min_frequency=3,
                  verbose=False,
                  is_dict=True,
                  total_symbols=False)
        file_with_merges.seek(0)
        print('Done', flush=True)
        merges = BPEMerges.load_from_file(file_with_merges)

    bpe_tokenizer = BPETokenizer(merges,
                                 tokenize_with_regex,
                                 mark_sequence_edges=True)
    bpe_vocabulary = build_vocabulary(
        wikitext_tokens(bpe_tokenizer.apply, 'Building BPE vocabulary'))
    print('BPE Vocabulary size:', len(bpe_vocabulary))
    bpe_vocabulary_file = io.StringIO('\n'.join(
        '{} {}'.format(word, counter) for word, counter in bpe_vocabulary))
    bpe_encoder = BPEEncoder(bpe_tokenizer,
                             bpe_vocabulary_file,
                             special_tokens=special_tokens)

    return bpe_encoder
Example #8
    def fix_file(self, corpus, pair, locale_code, src_file, target_file,
                 action):
        """learn and apply BPE on the whole file
        this will never change the actual file and always return an empty set
        """
        debug(f"  -[{pair}/{locale_code}]: processing {src_file} now!")
        learn_bpe(infile=open(src_file),
                  outfile=open(self.code_files[pair][locale_code], "w"),
                  verbose=self.verbose,
                  num_symbols=self.n_symbols,
                  min_frequency=self.min_frequency)
        copyfile(src_file, target_file)

        return set()
Example #9
File: bpe.py  Project: vaibhavsagar9/ParlAI
    def finalize(self,
                 frequencies: Dict[str, int],
                 num_symbols: int = 30000,
                 minfreq: int = 2) -> bool:
        """
        Build the codecs.

        :param frequencies:
            dictionary of (token: frequency) pairs
        :param num_symbols:
            Number of BPE symbols. Recommend 30000-40000.  If <= 0, default
            30000 will be used.
        :param minfreq:
            Minimum frequency of a token before forced BPE decomposition. If <=
            0 will use subword-nmt default of 2.

        :return did_finalize:
            return whether codecs are finalized this call.
        """
        if hasattr(self, 'bpe'):
            # we already finalized the codecs
            return False

        logging.debug(f'Saving bpe codecs to {self.codecs}')

        dictionary = ("{} {}".format(k, v) for k, v in frequencies.items())

        if num_symbols <= 0:
            num_symbols = 30000
        if minfreq <= 0:
            minfreq = 2

        codec_dir, _ = os.path.split(self.codecs)
        PathManager.mkdirs(codec_dir)
        with PathManager.open(self.codecs, 'w', encoding='utf-8') as outstream:
            learn_bpe.learn_bpe(
                dictionary,
                outstream,
                num_symbols=num_symbols,
                min_frequency=minfreq,
                is_dict=True,
            )

        self._load_from_codecs()
        return True
Example #10
def train_bpe(config):
    print('Start BPE training...')
    from subword_nmt.learn_bpe import main as learn_bpe
    train = json.load(open(config.train_file, 'r'))
    train_texts = []
    for p in train['data'][0]['paragraphs']:
        train_texts.append(
            preprocess_string(' '.join(word_tokenize(p['context'])),
                              remove_unicode=config.remove_unicode))
        for qas in p['qas']:
            train_texts.append(
                preprocess_string(' '.join(word_tokenize(qas['question'])),
                                  remove_unicode=config.remove_unicode))

    learn_bpe(train_texts,
              outfile=open(config.bpe_codes_file, 'w'),
              num_symbols=config.bpe_merges_count)
    print('BPE trained. BPE codes saved to {}'.format(config.bpe_codes_file))
Example #11
def create_train_bpe(train_loc, bpe_voc=['en', 'ru'], num_symbols=10000):
    """
    args:
        train_loc: location of train.lang files with previously tokenized data
    returns:
        writes train.bpe.lang files into train_loc
    """
    # build and apply bpe vocs
    bpe = {}
    for lang in bpe_voc:
        print("Learning BPE...")
        learn_bpe(open(train_loc + 'train.' + lang),
                  open(train_loc + 'bpe_rules.' + lang, 'w'),
                  num_symbols=num_symbols)
        bpe[lang] = BPE(open(train_loc + 'bpe_rules.' + lang))
        print("Writing train files...")
        with open(train_loc + 'train.bpe.' + lang, 'w') as f_out:
            for line in open(train_loc + 'train.' + lang):
                f_out.write(bpe[lang].process_line(line.strip()) + '\n')
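A hedged usage note (directory is hypothetical): the function expects pre-tokenized train.en and train.ru under train_loc and writes bpe_rules.* and train.bpe.* next to them.

create_train_bpe('./data/', bpe_voc=['en', 'ru'], num_symbols=10000)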
Example #12
def train_subword_model(src_text, trg_text, nb_symbols=10000):

    # create text content with source and target text
    content = []
    content.extend(src_text)
    content.extend(trg_text)

    bpe_model_io = io.StringIO()
    src_vocab_io = io.StringIO()
    trg_vocab_io = io.StringIO()

    # 1. Learn BPE model on both source and target text
    # 1.1 cat {train_file}.L1 {train_file}.L2 | subword-nmt learn-bpe -s {num_operations} -o {codes_file}
    # 1.2 subword-nmt apply-bpe -c {codes_file} < {train_file}.L1 | subword-nmt get-vocab > {vocab_file}.L1
    # 1.3 subword-nmt apply-bpe -c {codes_file} < {train_file}.L2 | subword-nmt get-vocab > {vocab_file}.L2

    # 1.1 learn_bpe(args.input, args.output, args.symbols, args.min_frequency, args.verbose, is_dict=args.dict_input, total_symbols=args.total_symbols))
    learn_bpe(content, bpe_model_io, nb_symbols, 0, False, False, False)

    # 1.2
    src_text_tok = apply_bpe(bpe_model_io, src_text, merges=nb_symbols)
    get_vocab(src_text_tok, src_vocab_io)
    src_vocab_io.seek(0)
    src_vocab = read_vocabulary(src_vocab_io, 0)
    # 1.3
    trg_text_tok = apply_bpe(bpe_model_io, trg_text, merges=nb_symbols)
    get_vocab(trg_text_tok, trg_vocab_io)
    trg_vocab_io.seek(0)
    trg_vocab = read_vocabulary(trg_vocab_io, 0)

    # 3. Re-apply BPE with the obtained vocabulary
    # subword-nmt apply-bpe -c {codes_file} --vocabulary {vocab_file}.L1 --vocabulary-threshold 50 < {train_file}.L1 > {train_file}.BPE.L1
    src_text_tok = apply_bpe(bpe_model_io, src_text, vocab=src_vocab)
    trg_text_tok = apply_bpe(bpe_model_io, trg_text, vocab=trg_vocab)

    bpe_model = bpe_model_io.getvalue()

    bpe_model_io.close()
    src_vocab_io.close()
    trg_vocab_io.close()

    return bpe_model, src_vocab, trg_vocab, src_text_tok, trg_text_tok
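For reference, a hedged sketch of the vocabulary-threshold step (the 1.2/1.3 and re-apply comments above) written against subword-nmt's own API rather than the local apply_bpe/get_vocab wrappers; the file names are the placeholders from those comments and are assumed to exist:

import codecs
from subword_nmt.apply_bpe import BPE, read_vocabulary

with codecs.open('codes_file', encoding='utf-8') as codes, \
     codecs.open('vocab_file.L1', encoding='utf-8') as vocab_file:
    vocab = read_vocabulary(vocab_file, 50)  # keep words with frequency >= 50
    bpe = BPE(codes, vocab=vocab)            # rare words outside vocab get split further

with codecs.open('train_file.L1', encoding='utf-8') as fin, \
     codecs.open('train_file.BPE.L1', 'w', encoding='utf-8') as fout:
    for line in fin:
        fout.write(bpe.process_line(line))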
Example #13
 def subword(self, cleaned_filepaths, overwrite):
     bpe_filepath = get_bpe_path(self.experiment_name, self.merge_ops)
     if self.corpora_type == 'training':
         # Concatenated file necessary for BPE learning
         concatenated_filepath = get_concat_path(self.file_prefix)
         concatenate_files(cleaned_filepaths,
                           concatenated_filepath,
                           overwrite=overwrite)
         if os.path.exists(bpe_filepath) and overwrite == False:
             print(bpe_filepath, 'already exists')
         else:
             print('Learning BPE encoding. This may take a while.')
             with open(concatenated_filepath, 'r',
                       encoding='utf-8') as infile, open(
                           bpe_filepath, 'w', encoding='utf-8') as outfile:
                 learn_bpe.learn_bpe(
                     infile, outfile, num_symbols=self.merge_ops
                 )  # Get codecs, write codecs to outfile
     print('Applying')
     with open(bpe_filepath, 'r', encoding='utf-8') as codec:
         bpe = apply_bpe.BPE(codec)
     print('Writing bpe')
     for i, lang in enumerate(self.langs):
         lang_filepath = cleaned_filepaths[i]
         processed_filepath = get_processed_data_path(
             self.experiment_name, self.corpora_type, lang)
         if overwrite == False and os.path.exists(processed_filepath):
             continue
         with open(lang_filepath, 'r',
                   encoding='utf-8') as f1, open(processed_filepath,
                                                 'w',
                                                 encoding='utf-8') as f2:
             for line in f1:
                 f2.write(bpe.process_line(line))
         if self.corpora_type == 'training':
             vocab_filepath = get_vocab_path(self.experiment_name, lang)
             with open(processed_filepath, 'r',
                       encoding='utf-8') as train_file, open(
                           vocab_filepath, 'w',
                           encoding='utf-8') as vocab_file:
                 get_vocab.get_vocab(train_file, vocab_file)
Example #14
    def learn_bpe(self, item_list, bin_file=False, from_filenames=True):
        logging.info('generating bpe codes file. saving to %s' %
                     self.codes_file)
        if from_filenames:
            filenames = item_list
            if isinstance(filenames, str):
                filenames = [filenames]

            # get combined vocabulary of all input files
            full_vocab = OrderedCounter()
            if bin_file:
                for fname in filenames:
                    reader = open(fname, 'rb')
                    len_bytes = reader.read(8)
                    if not len_bytes: break  # finished reading this file
                    str_len = struct.unpack('q', len_bytes)[0]
                    example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
                    example_str = example_pb2.Example.FromString(example_str)
                    article_text = example_str.features.feature['article'].bytes_list.value[
                        0].decode()  # the article text was saved under the key 'article' in the data files
                    abstract_text = example_str.features.feature['abstract'].bytes_list.value[
                        0].decode()  # the abstract text was saved under the key 'abstract' in the data files
                    full_vocab += learn_bpe.get_vocabulary(abstract2sents(article_text) + abstract2sents(abstract_text))
            else:
                for fname in filenames:
                    with codecs.open(fname, encoding='UTF-8') as f:
                        full_vocab += learn_bpe.get_vocabulary(f)
        else:
            # get combined vocabulary of all input texts
            full_vocab = OrderedCounter()
            full_vocab += learn_bpe.get_vocabulary(item_list)

        vocab_list = ['{0} {1}'.format(key, freq)
                      for (key, freq) in full_vocab.items()]
        # learn BPE on combined vocabulary
        with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
            learn_bpe.learn_bpe(vocab_list, output, self.num_symbols,
                           self.min_frequency, False, is_dict=True)
        self.set_bpe(self.codes_file)
Example #15
    def finalize(self):
        """Build the codecs"""
        if self.built:
            return False

        self.built = True

        with open(self.codecs, 'w') as outstream:
            # There's a potentially more memory efficient way to do this, with
            # the is_dict method able to handle <word> \t <count> format.
            # It will require more sophisticated marshalling of data back and
            # forth
            learn_bpe.learn_bpe(
                self.training_data,
                outstream,
                num_symbols=self.num_symbols,
                min_frequency=self.minfreq,
                is_dict=False,
            )

        self._load_from_codecs()
        return True
Example #16
def tokenize_corpus_hw(data='data.txt',
                       train_loc='./',
                       bpe_voc=['en', 'ru'],
                       num_symbols=10000):
    tokenizer = WordPunctTokenizer()
    with open(train_loc + 'train.' + bpe_voc[0], 'w') as f_src,  \
         open(train_loc + 'train.' + bpe_voc[1], 'w') as f_dst:
        for line in open(data):
            src_line, dst_line = line.strip().split('\t')
            f_src.write(tokenize(src_line, tokenizer) + '\n')
            f_dst.write(tokenize(dst_line, tokenizer) + '\n')

    # build and apply bpe vocs
    bpe = {}
    for lang in bpe_voc:
        learn_bpe(open(train_loc + 'train.' + lang),
                  open(train_loc + 'bpe_rules.' + lang, 'w'),
                  num_symbols=num_symbols)
        bpe[lang] = BPE(open(train_loc + 'bpe_rules.' + lang))

        with open(train_loc + 'train.bpe.' + lang, 'w') as f_out:
            for line in open(train_loc + 'train.' + lang):
                f_out.write(bpe[lang].process_line(line.strip()) + '\n')
Example #17
    def learn_bpe(self, item_list, from_filenames=True):
        logging.info('generating bpe codes file. saving to %s' %
                     self.codes_file)

        # get vocabulary at word level (before bpe)
        def segment_words(line):
            return _segment_words(line, self.pre_tokenize)

        vocab_words = _get_vocabulary(item_list,
                                      from_filenames=from_filenames,
                                      segment=segment_words)
        vocab_list = [
            '{0} {1}'.format(key, freq) for (key, freq) in vocab_words.items()
        ]
        # learn BPE on combined vocabulary
        with codecs.open(self.codes_file, 'w', encoding='UTF-8') as output:
            learn_bpe.learn_bpe(vocab_list,
                                output,
                                num_symbols=self.num_symbols,
                                min_frequency=self.min_frequency,
                                verbose=False,
                                is_dict=True,
                                total_symbols=self.total_symbols)
        self.set_bpe(self.codes_file)
Example #18
    def __init__(self,
                 elements,
                 prune,
                 max_num,
                 start=True,
                 stop=True,
                 pad=True,
                 unk=True,
                 rule=False,
                 bpe=-1):
        self.start = start
        self.stop = stop
        self.codes = None
        vocab = Counter()
        self.max_num = max_num
        self.itos = []
        self.stoi = {}
        if pad:
            self.addSymbol('<blank>')
        if unk:
            self.addSymbol('<unk>')
        if start:
            self.addSymbol('<s>')
        if stop:
            self.addSymbol('</s>')
        self.rule = rule
        if rule:  # Adding these for both ATIS and CONCODE. Extra things in the vocab are ok.
            for pre_terminal in CDDataset.pre_terminal_symbols:
                self.addSymbol(CDDataset._unk_rule_from_Nt(pre_terminal))

        if bpe >= 0:
            self.codes = learn_bpe.learn_bpe(elements, bpe,
                                             0)  #  last is min freq
            b = apply_bpe.BPE(self.codes)
            elements = b.segment_tokens(elements)

        for w in elements:
            vocab[w] += 1
        if bpe >= 0:
            print('Vocab size {}'.format(len(vocab)))

        # prune low frequency words
        max_vocab = self.max_num if not rule else 100000000000
        for (w, f) in vocab.most_common(max_vocab):
            if ((rule == False and f > prune)
                    or (rule == True and not CDDataset._is_terminal_rule(w))
                    or (rule == True and CDDataset._is_terminal_rule(w)
                        and len(self.itos) < self.max_num)
                    or w.endswith("_concodeNT")):
                word = w.replace('concodeclass_',
                                 '').replace('concodefunc_', '')
                self.itos.append(word)
                self.stoi[word] = len(self.itos) - 1
            else:  #map everything else to unk
                if rule:
                    # We need the right kind of UNK rule here
                    mapped_to_known_unk = False
                    for pre_terminal in CDDataset.pre_terminal_symbols:
                        if pre_terminal in w:
                            self.stoi[w] = self.stoi[
                                CDDataset._unk_rule_from_Nt(pre_terminal)]
                            mapped_to_known_unk = True
                            break

                    if not mapped_to_known_unk:
                        # An unk type we dont know about. Investigate.
                        import ipdb
                        ipdb.set_trace()
                        # For next_rules, we cannot have any other type of unk
                        self.stoi[w] = self.stoi['<unk>']
                else:
                    self.stoi[w] = self.stoi['<unk>']
Example #19
def get_dict(args):
    input_args = [
        args.train_prefix + '.' + args.source_lang,
        args.train_prefix + '.' + args.target_lang
    ]
    path = '/Users/chaofeng/atmt/assignment3/baseline/raw_data_back20000/'
    vocab_args = [
        path + "dict" + '.' + args.source_lang,
        path + "dict" + '.' + args.target_lang
    ]
    #input_args = [path+"train.de", path+"train.en"]
    #vocab_args = [path+"dict.de", path+"dict.en"]

    separator = '@@'
    symbols = 10000
    min_frequency = 1
    output = path + "code"

    # read/write files as UTF-8
    input = [codecs.open(f, encoding='UTF-8') for f in input_args]
    vocab = [codecs.open(f, mode='w', encoding='UTF-8') for f in vocab_args]
    # get combined vocabulary of all input texts
    full_vocab = Counter()
    for f in input:
        full_vocab += learn_bpe.get_vocabulary(f)
        f.seek(0)

    vocab_list = [
        '{0} {1}'.format(key, freq) for (key, freq) in full_vocab.items()
    ]

    # learn BPE on combined vocabulary
    with codecs.open(output, mode='w', encoding='UTF-8') as file:
        learn_bpe.learn_bpe(vocab_list, file, symbols, min_frequency)

    with codecs.open(output, encoding='UTF-8') as codes:
        bpe = apply_bpe.BPE(codes, separator=separator)

    # apply BPE to each training corpus and get vocabulary
    for train_file, vocab_file in zip(input, vocab):

        tmp = tempfile.NamedTemporaryFile(delete=False)
        tmp.close()

        tmpout = codecs.open(tmp.name, 'w', encoding='UTF-8')

        train_file.seek(0)
        for line in train_file:
            tmpout.write(bpe.segment(line).strip())
            tmpout.write('\n')

        tmpout.close()
        tmpin = codecs.open(tmp.name, encoding='UTF-8')

        vocab = learn_bpe.get_vocabulary(tmpin)
        tmpin.close()
        os.remove(tmp.name)

        for key, freq in sorted(vocab.items(),
                                key=lambda x: x[1],
                                reverse=True):
            vocab_file.write("{0} {1}\n".format(key, freq))
    return vocab_args, output
Example #20
def main(argv):
    import argparse
    from io import StringIO

    # argument parsing
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="adapts an UD dataset to context-sensitive lemmatization")

    io_group = parser.add_argument_group('io')
    io_group.add_argument("--input", help="file to be transformed", type=str)
    io_group.add_argument("--output",
                          help="output source and target files",
                          nargs='+',
                          type=str,
                          default=None)
    io_group.add_argument(
        "--transform_appendix",
        help=
        "appendix to transform folder name (e.g. SLURM_JOB_ID or datetime)",
        type=str,
        default=None)
    io_group.add_argument(
        "--word_column_index",
        help="index of word column in the file (zero-indexed)",
        type=int,
        default=0)
    io_group.add_argument(
        "--lemma_column_index",
        help="index of lemma column in the file (zero-indexed)",
        type=int,
        default=1)
    io_group.add_argument(
        "--tag_column_index",
        help="index of tag column in the file (zero-indexed)",
        type=int,
        default=2)
    io_group.add_argument('--debug',
                          dest='debug',
                          help="debug mode prints target/source file to stdout"
                          " instead of writing to the file system",
                          action='store_true')
    io_group.add_argument('--overwrite', dest='overwrite', action='store_true')
    io_group.add_argument(
        "--print_file",
        help="which file to output (source/target) in debug mode",
        choices=['source', 'target'],
        type=str,
        default=defaults["PRINT_FILE"])

    repr_group = parser.add_argument_group('representation')
    repr_group.add_argument(
        "--mode",
        help="mode of transformation",
        choices=['word_and_context', 'sentence_to_sentence'],
        type=str,
        default=defaults["MODE"])
    repr_group.add_argument("--word_unit",
                            help="type of word representation",
                            choices=['char', 'word', 'bpe'],
                            type=str,
                            default=defaults["WORD_UNIT"])
    repr_group.add_argument("--tag_unit",
                            help="type of tag representation",
                            choices=['char', 'word'],
                            type=str,
                            default=defaults["TAG_UNIT"])
    repr_group.add_argument("--context_unit",
                            help="type of context representation",
                            choices=['char', 'bpe', 'word'],
                            type=str,
                            default=defaults["CONTEXT_UNIT"])
    repr_group.add_argument(
        "--char_n_gram_mode",
        help="size of char-n-grams (only used if --context_unit is char"
        "or if --mode is sentence_to_sentence and --word_unit is char, default: %(default)s)",
        type=int,
        default=defaults["CHAR_N_GRAM"])
    repr_group.add_argument(
        "--sentence_size",
        help="maximum size of sentence in sentence_to_sentence mode",
        type=int,
        default=argparse.SUPPRESS)
    repr_group.add_argument('--tag_first',
                            action='store_true',
                            help="if true tags will be printed before "
                            "words in source and target files")

    ctx_group = parser.add_argument_group('context')
    ctx_group.add_argument(
        "--context_size",
        help=
        "size of context representation (in respective units) on left and right (0 to use full span)",
        type=int,
        default=defaults["CONTEXT_SIZE"])
    ctx_group.add_argument(
        "--context_char_size",
        help=
        "size of context representation (in characters) on left and right (0 to use full span, has precedence over --context_size)",
        type=int,
        default=argparse.SUPPRESS)
    ctx_group.add_argument(
        "--context_span",
        help=
        "maximum span of a word in number of sentences on left and right of the sentence of the word, default: %(default)s))",
        type=int,
        default=defaults["CONTEXT_SPAN"])
    ctx_group.add_argument(
        "--context_tags",
        help="whether and where to include tag in the context",
        choices=['none', 'left'],
        type=str,
        default=defaults["CONTEXT_TAGS"])

    bpe_group = parser.add_argument_group('bpe')
    bpe_group.add_argument(
        "--bpe_operations",
        help="number of BPE merge operations to be learned "
        "(corresponds to number of symbols/char-n-grams/codes)",
        type=int,
        default=defaults["BPE_OPERATIONS"])
    bpe_group.add_argument(
        "--bpe_codes_path",
        help=
        "full file path to export BPE codes to or to read them from if available",
        type=str,
        default=None)

    boundary_group = parser.add_argument_group('boundaries')
    boundary_group.add_argument(
        "--left_context_boundary",
        help="left context boundary special symbol (default: %(default)s)",
        type=str,
        default=defaults["LEFT_CONTEXT_BOUNDARY"])
    boundary_group.add_argument(
        "--example_boundary",
        help="example boundary special symbol (default: %(default)s)",
        type=str,
        default=defaults["EXAMPLE_BOUNDARY"])
    boundary_group.add_argument(
        "--right_context_boundary",
        help="right context boundary special symbol (default: %(default)s)",
        type=str,
        default=defaults["RIGHT_CONTEXT_BOUNDARY"])
    boundary_group.add_argument(
        "--word_boundary",
        help="word boundary special symbol (default: %(default)s)",
        type=str,
        default=defaults["WORD_BOUNDARY"])
    boundary_group.add_argument(
        "--tag_boundary",
        help="tag boundary special symbol (default: %(default)s)",
        type=str,
        default=defaults["TAG_BOUNDARY"])
    boundary_group.add_argument(
        '--subword_separator',
        type=str,
        default=defaults["SUBWORD_SEPARATOR"],
        metavar='STR',
        help=
        "separator between non-final BPE subword units (default: '%(default)s'))"
    )

    args = parser.parse_args(argv)

    # determining input
    if args.input is None:
        if args.output is None:
            raise ValueError(
                "Can't decide how to name the transformation because you feed from stdin. Use --output to specify path."
            )
        args.input = sys.stdin
    else:
        input_folders = re.split("/+|\\\\+", args.input)
        if len(input_folders) < 2:
            raise ValueError(
                "Can't decide how to name the transformation. Use --output to specify path."
            )
        else:
            input_folder = input_folders[-2]
            input_filename = input_folders[-1].split(".")[0]

    # determining output
    if not args.debug:
        if args.output is None or (type(args.output) is list
                                   and len(args.output) == 1):
            transform_folder = "{}_{}_{}{}{}{}{}{}{}{}{}".format(
                input_folder, "w" + args.word_unit, "t" + args.tag_unit,
                ("_" + (("{:02d}u".format(args.context_size))
                        if not hasattr(args, 'context_char_size') else
                        (str(args.context_char_size) + "ch")))
                if args.mode == 'word_and_context' else "",
                ("_" + ("c" + args.context_unit))
                if args.mode == "word_and_context" else "",
                "_n{}".format(args.char_n_gram_mode) if
                ((args.mode == 'word_and_context'
                  and args.context_unit == "char") or
                 (args.mode == 'sentence_to_sentence'
                  and args.word_unit == 'char')) else "",
                "_n{}".format(args.bpe_operations) if
                ((args.mode == 'word_and_context'
                  and args.context_unit == "bpe") or
                 (args.mode == 'sentence_to_sentence'
                  and args.word_unit == 'bpe')) else "",
                "_ct" if args.context_tags == 'left' else "",
                "_tf" if args.tag_first else "",
                "_cs{}".format(args.context_span)
                if args.mode == 'word_and_context' else "",
                ".{}".format(args.transform_appendix)
                if args.transform_appendix else "")

            if args.output is None or not args.output or '' in args.output:
                full_transform_folder_path = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), 'input',
                    args.mode, transform_folder)
            else:
                full_transform_folder_path = os.path.join(
                    args.output[0], transform_folder)

            os.makedirs(full_transform_folder_path, exist_ok=True)

            output_source_path = os.path.join(
                full_transform_folder_path, '{}_source'.format(input_filename))
            output_target_path = os.path.join(
                full_transform_folder_path, '{}_target'.format(input_filename))

            print(full_transform_folder_path)

            if not args.overwrite and (os.path.isfile(output_source_path)
                                       or os.path.isfile(output_target_path)):
                raise ValueError(
                    "Output files for {} already exist in {}. Pass --overwrite or delete them."
                    .format(input_filename, full_transform_folder_path))

            # truncate output files or create them anew
            open(output_source_path, 'w').close()
            open(output_target_path, 'w').close()
        else:
            if len(args.output) != 2:
                raise ValueError(
                    "You must specify full target and source output file paths (including file name)."
                )
            full_transform_folder_path = None
            output_source_path = args.output[0]
            output_target_path = args.output[1]

    print(args, file=sys.stderr)

    # loading file
    infile_df = preprocess_dataset_for_train(
        pd.read_csv(args.input,
                    sep='\s+',
                    names=cols,
                    usecols=[
                        args.word_column_index, args.lemma_column_index,
                        args.tag_column_index
                    ],
                    skip_blank_lines=False,
                    comment='#',
                    quoting=3)[cols])
    infile_df = infile_df.reset_index(drop=True)

    # subword preprocessing of the input file
    if (args.mode == 'word_and_context' and args.context_unit == 'char') or (
            args.mode == 'sentence_to_sentence' and args.word_unit == 'char'):
        # uses subword-nmt to segment text into chargrams
        from types import SimpleNamespace
        import numpy as np
        sys.path.append(
            os.path.join(os.path.dirname(__file__), '..', 'subword-nmt'))
        from subword_nmt.segment_char_ngrams import segment_char_ngrams

        def segment(col):
            subword_nmt_output = StringIO()
            segment_char_ngrams(
                SimpleNamespace(input=infile_df[col].dropna().astype(str),
                                vocab={},
                                n=args.char_n_gram_mode,
                                output=subword_nmt_output,
                                separator=args.subword_separator))
            subword_nmt_output.seek(0)
            infile_df.loc[infile_df[col].notnull(), [col]] = np.array([
                line.rstrip(' \t\n\r') for line in subword_nmt_output
            ])[:, np.newaxis]
            subword_nmt_output.truncate(0)

        segment("word")
        if args.mode == 'sentence_to_sentence' and args.word_unit == 'char':
            segment("lemma")
    elif (args.mode == 'word_and_context' and args.context_unit == 'bpe') or (
            args.mode == 'sentence_to_sentence' and args.word_unit == 'bpe'):
        if args.bpe_codes_path:
            bpe_codes_file_path = args.bpe_codes_path
        elif full_transform_folder_path:
            bpe_codes_file_path = os.path.join(full_transform_folder_path,
                                               "bpe_codes")
        else:
            raise ValueError(
                "Specify transformation output folder or bpe output file path in order to export BPE codes."
            )

        # BPE processing
        sys.path.append(
            os.path.join(os.path.dirname(__file__), '..', 'subword-nmt'))
        from subword_nmt.apply_bpe import BPE

        # only learn BPEs if bpe_codes file is unavailable
        if not os.path.isfile(bpe_codes_file_path):
            # as advised in subword-nmt's readme, we learn BPE jointly on the sources and targets
            # because they share an alphabet (for the most part)
            from subword_nmt.learn_bpe import learn_bpe
            bpe_codes = open(bpe_codes_file_path, "w", encoding='utf-8')
            learn_bpe(
                infile_df[["word", "lemma"]].dropna().astype(str).to_string(
                    index=False, header=False).splitlines(), bpe_codes,
                args.bpe_operations)
            bpe_codes.close()

        with open(bpe_codes_file_path, encoding='utf-8') as bpe_codes:
            # apply all merge operations, without vocabulary and glossaries
            bpe = BPE(bpe_codes, -1, args.subword_separator, [], [])
            infile_df.loc[infile_df["word"].notnull(), ["word", "lemma"]] = \
                infile_df.loc[infile_df["word"].notnull(), ["word", "lemma"]].applymap(bpe.process_line)

    sentence_indices = pd.isna(infile_df).all(axis=1)
    sentence_end_iterator = (i for i, e in sentence_indices.to_dict().items()
                             if e is True)

    # per-mode specific processing
    if args.mode == 'word_and_context':
        sentence_dfs = []
        transformer_args = {
            'word_unit':
            args.word_unit,
            'tag_unit':
            args.tag_unit,
            'context_size':
            args.context_size,
            'context_char_size':
            args.context_char_size
            if hasattr(args, 'context_char_size') else None,
            'context_tags':
            args.context_tags,
            'tag_first':
            args.tag_first,
            'left_context_boundary':
            args.left_context_boundary,
            'tag_boundary':
            args.tag_boundary,
            'right_context_boundary':
            args.right_context_boundary,
            'word_boundary':
            args.word_boundary,
            'example_boundary':
            args.example_boundary,
            'subword_separator':
            args.subword_separator
        }

        transformer = Transformer(**transformer_args)

        sentence_start = 0
        for sentence_end in sentence_end_iterator:
            sentence_dfs.append(infile_df.loc[sentence_start:sentence_end - 1])
            sentence_start = sentence_end + 1

        for sentence_df_idx, sentence_df in enumerate(sentence_dfs):
            # adds additional context according to CONTEXT_SPAN to sentence below
            lc_df = pd.DataFrame()
            rc_df = pd.DataFrame()

            if args.context_span > 0:
                lc_df_ls = sentence_dfs[
                    max(sentence_df_idx -
                        args.context_span, 0):sentence_df_idx]
                if lc_df_ls:
                    lc_df = pd.concat(lc_df_ls)

                rc_df_ls = sentence_dfs[
                    sentence_df_idx +
                    1:min(sentence_df_idx + 1 + args.context_span,
                          len(sentence_dfs) - 1)]
                if rc_df_ls:
                    rc_df = pd.concat(rc_df_ls)

            output_source_lines, output_target_lines = transformer.process_sentence(
                sentence_df, lc_df, rc_df)

            if not (output_source_lines or output_target_lines):
                continue

            if args.debug:
                if args.print_file == 'source':
                    print("\n".join(output_source_lines))
                else:
                    print("\n".join(output_target_lines))
            else:
                with open(output_source_path, 'a+', encoding='utf-8') as outsourcefile, \
                        open(output_target_path, 'a+', encoding='utf-8') as outtargetfile:
                    outsourcefile.write("\n".join(output_source_lines) + "\n")
                    outtargetfile.write("\n".join(output_target_lines) + "\n")
    elif args.mode == 'sentence_to_sentence':
        sentence_start = 0

        if args.example_boundary is not None:
            pos_close_tag = args.example_boundary.find('<') + 1
            open_tag = args.example_boundary
            close_tag = open_tag[:pos_close_tag] + '/' + open_tag[
                pos_close_tag:]

        for sentence_end in sentence_end_iterator:
            output_source_line = [open_tag]
            output_target_line = [open_tag]

            last_split_pos = 0

            for sentence_idx in range(sentence_start, sentence_end):
                subwords = re.split("\s*{}\s*".format(args.subword_separator),
                                    infile_df.at[sentence_idx, "word"])
                lemma = re.split("\s*{}\s*".format(args.subword_separator),
                                 infile_df.at[sentence_idx, "lemma"])
                tag = infile_df.at[sentence_idx, "tag"]

                # inserts a breaking point at the position before this word+tag were inserted in both the source and the target
                output_source_insertion_point = len(output_source_line)
                output_target_insertion_point = len(output_target_line)

                output_source_line.extend(subwords)
                output_source_line.append(args.word_boundary)

                if not args.tag_first:
                    output_target_line.extend(lemma)
                    output_target_line.append(args.tag_boundary)

                if args.tag_unit == "word":
                    output_target_line.append(tag)
                else:
                    output_target_line.extend(tag)

                if args.tag_first:
                    output_target_line.append(args.tag_boundary)
                    output_target_line.extend(lemma)

                output_target_line.append(args.word_boundary)

                # if the target translation overflows (target sentence is guaranteed to be longer in size)
                # sanity check: awk 'NF > 50 { print NR, NF }' dev_source | wc -l
                last_split_size = len(output_target_line) - last_split_pos
                if hasattr(args, 'sentence_size'
                           ) and last_split_size > args.sentence_size:
                    output_source_line.insert(output_source_insertion_point,
                                              defaults["SENTENCE_SPLIT_TAG"])
                    output_target_line.insert(output_target_insertion_point,
                                              defaults["SENTENCE_SPLIT_TAG"])
                    # set to 1, for internal slices to account for the opening <w> sentence boundary tag
                    last_split_pos = output_target_insertion_point

            sentence_start = sentence_end + 1

            output_source_line.pop()
            output_target_line.pop()

            output_source_line.append(close_tag)
            output_target_line.append(close_tag)

            assert output_source_line.count(defaults["SENTENCE_SPLIT_TAG"]) == output_target_line.count(defaults["SENTENCE_SPLIT_TAG"]), \
                "Sentence splits in sentence_to_sentence mode are wrong."

            # split sentences if necessary
            split_cond = True
            end_source_line_split_pos = 0
            end_target_line_split_pos = 0

            while split_cond:
                try:
                    start_source_line_pos = end_source_line_split_pos
                    start_target_line_pos = end_target_line_split_pos
                    end_source_line_split_pos = output_source_line.index(
                        defaults["SENTENCE_SPLIT_TAG"],
                        end_source_line_split_pos) + 1
                    end_target_line_split_pos = output_target_line.index(
                        defaults["SENTENCE_SPLIT_TAG"],
                        end_target_line_split_pos) + 1
                except ValueError:
                    split_cond = False
                    end_source_line_split_pos = len(output_source_line) + 1
                    end_target_line_split_pos = len(output_target_line) + 1

                output_source_line_split = output_source_line[
                    start_source_line_pos:end_source_line_split_pos - 1]
                output_target_line_split = output_target_line[
                    start_target_line_pos:end_target_line_split_pos - 1]

                if output_source_line_split[-1] == defaults["WORD_BOUNDARY"]:
                    output_source_line_split[-1] = close_tag
                if output_source_line_split[0] != open_tag:
                    output_source_line_split.insert(0, open_tag)

                if output_target_line_split[-1] == defaults["WORD_BOUNDARY"]:
                    output_target_line_split[-1] = close_tag
                if output_target_line_split[0] != open_tag:
                    output_target_line_split.insert(0, open_tag)

                if args.debug:
                    if args.print_file == 'source':
                        print(" ".join(output_source_line_split))
                    else:
                        print(" ".join(output_target_line_split))
                    print("\n")
                else:
                    with open(output_source_path, 'a+', encoding='utf-8') as outsourcefile, \
                            open(output_target_path, 'a+', encoding='utf-8') as outtargetfile:
                        outsourcefile.write(
                            " ".join(output_source_line_split) + "\n")
                        outtargetfile.write(
                            " ".join(output_target_line_split) + "\n")
Example #21
def main(args):
    corpus_path_list = args.corpus
    if args.save_dir is None:
        args.save_dir = args.model
    for corpus_path in corpus_path_list:
        if not os.path.exists(corpus_path):
            raise ValueError(
                'The path="{}" provided by --corpus does not exist!'.format(
                    corpus_path))
    print('Learn the "{}"s subword model based on {}.'.format(
        args.model, args.corpus))
    os.makedirs(args.save_dir, exist_ok=True)
    model_prefix = os.path.join(args.save_dir, args.model)
    print('Save the subword model to {}.model'.format(model_prefix))
    print('Save the vocabulary to {}.vocab'.format(model_prefix))
    print()
    print('------- Start Training -------------')
    special_tokens_kv = OrderedDict()
    if not args.disable_unk:
        special_tokens_kv['unk_token'] = Vocab.UNK_TOKEN
    if not args.disable_bos:
        special_tokens_kv['bos_token'] = Vocab.BOS_TOKEN
    if not args.disable_eos:
        special_tokens_kv['eos_token'] = Vocab.EOS_TOKEN
    if not args.disable_pad:
        special_tokens_kv['pad_token'] = Vocab.PAD_TOKEN
    # split custom special tokens
    if args.model in ['yttm'] and len(args.custom_special_tokens) > 0:
        raise ValueError(
            'model {} does not support custom_special_tokens'.format(args.model))
    additional_custom_special_token = OrderedDict()
    for custom_special_token in args.custom_special_tokens:
        kv = custom_special_token.split('=')
        if not len(kv) == 2:
            raise ValueError(
                'parameter {} has wrong format'.format(custom_special_token))
        k, v = kv[0], kv[1]
        if k in special_tokens_kv:
            warnings.warn(
                f'There are overlaps between the custom special tokens and the'
                f' unk, bos, eos, pad tokens. Currently, we will overwrite the '
                f'default tokens. We will overwrite "{k}" to "{v}"')
        special_tokens_kv[k] = v
        additional_custom_special_token[k] = v
    if args.model == 'hf_wordpiece':
        tokenizers = try_import_huggingface_tokenizers()
        if 'unk_token' not in special_tokens_kv or special_tokens_kv[
                'unk_token'] != '[UNK]':
            # TODO, HF Tokenizer must have the unk token.
            special_tokens_kv['unk_token'] = '[UNK]'
        if parse_version(tokenizers.__version__) < parse_version('0.8'):
            # The older version of Tokenizers
            # hf_wordpiece must contain mask, cls and sep tokens
            # the custom defined mask,cls,sep can overwrite the default settings
            if 'mask_token' not in special_tokens_kv:
                special_tokens_kv['mask_token'] = Vocab.MASK_TOKEN
            if 'cls_token' not in special_tokens_kv:
                special_tokens_kv['cls_token'] = Vocab.CLS_TOKEN
            if 'sep_token' not in special_tokens_kv:
                special_tokens_kv['sep_token'] = Vocab.SEP_TOKEN
    special_tokens = list(special_tokens_kv.values())
    print('special tokens: ' + ', '.join(special_tokens))
    vocab = []
    if args.model == 'spm':
        try_import_sentencepiece()
        import sentencepiece as spm
        corpus_path = ','.join(corpus_path_list)
        script = '--input={} --model_prefix={} --vocab_size={} --character_coverage={} --input_sentence_size={}' \
                 .format(corpus_path, model_prefix, args.vocab_size, args.coverage, args.input_sentence_size)
        script += (' --unk_id=' +
                   str(list(special_tokens_kv.keys()).index('unk_token')))
        script += (' --bos_id=' + ('-1' if args.disable_bos else str(
            list(special_tokens_kv.keys()).index('bos_token'))))
        script += (' --eos_id=' + ('-1' if args.disable_eos else str(
            list(special_tokens_kv.keys()).index('eos_token'))))
        script += (' --pad_id=' + ('-1' if args.disable_pad else str(
            list(special_tokens_kv.keys()).index('pad_token'))))
        if len(additional_custom_special_token) > 0:
            script += (
                ' --control_symbols=' +
                ','.join(list(additional_custom_special_token.values())))
        print(script)
        spm.SentencePieceTrainer.Train(script)
        if 'bos_token' in special_tokens_kv:
            special_tokens_kv['bos_token'] = '<s>'
        if 'eos_token' in special_tokens_kv:
            special_tokens_kv['eos_token'] = '</s>'
        # build spm vocab
        spm_model = spm.SentencePieceProcessor()
        spm_model.load(model_prefix + '.model')
        vocab = [spm_model.id_to_piece(i) for i in range(len(spm_model))]
        os.remove(model_prefix + '.vocab')
    elif args.model == 'subword_nmt':
        try_import_subword_nmt()
        from subword_nmt import learn_bpe
        corpus_path = cat_corpus(corpus_path_list)\
            if len(corpus_path_list) > 1 else corpus_path_list[0]
        # build model
        with open(corpus_path, 'r', encoding='utf-8') as fc,\
             open(model_prefix + '.model', 'w', encoding='utf-8') as fm:
            learn_bpe.learn_bpe(fc,
                                fm,
                                args.vocab_size - len(special_tokens),
                                total_symbols=True)
        # build vocab
        with open(corpus_path, 'r', encoding='utf-8') as fc, \
             open(model_prefix + '.model', 'r', encoding='utf-8') as fm:
            vocab.extend(special_tokens)
            uniq_chars_internal = set()
            uniq_chars_final = set()
            uniq_words = set()
            for line in fc:
                for word in line.strip('\r\n ').split(' '):
                    if word:
                        uniq_words.add(word)
            # this code piece is the same as what
            # https://github.com/rsennrich/subword-nmt/blob/master/subword_nmt/learn_bpe.py does
            uniq_words = [
                tuple(x[:-1]) + (x[-1] + '</w>', ) for x in uniq_words
            ]
            for word in uniq_words:
                for char in word[:-1]:
                    uniq_chars_internal.add(char)
                uniq_chars_final.add(word[-1])
            # sort to ensure the same settings produce the same vocab
            vocab.extend(sorted(list(uniq_chars_internal)))
            vocab.extend(sorted(list(uniq_chars_final)))
            fm.readline()
            pair = fm.readline()
            while pair:
                vocab.append(pair.replace(' ', '', 1).strip())
                pair = fm.readline()
        if len(corpus_path_list) > 1:
            os.remove(corpus_path)
    elif args.model == 'yttm':
        try_import_yttm()
        import youtokentome as yttm
        corpus_path = cat_corpus(corpus_path_list)\
            if len(corpus_path_list) > 1 else corpus_path_list[0]
        tokenizer = yttm.BPE.train(
            data=corpus_path,
            model=model_prefix + '.model',
            vocab_size=args.vocab_size,
            coverage=args.coverage,
            n_threads=args.n_threads,
            unk_id=special_tokens.index(Vocab.UNK_TOKEN),
            bos_id=-1
            if args.disable_bos else special_tokens.index(Vocab.BOS_TOKEN),
            eos_id=-1
            if args.disable_eos else special_tokens.index(Vocab.EOS_TOKEN),
            pad_id=-1
            if args.disable_pad else special_tokens.index(Vocab.PAD_TOKEN))
        vocab = tokenizer.vocab()
        if 'unk_token' in special_tokens_kv:
            special_tokens_kv['unk_token'] = '<UNK>'
        if 'bos_token' in special_tokens_kv:
            special_tokens_kv['bos_token'] = '<BOS>'
        if 'eos_token' in special_tokens_kv:
            special_tokens_kv['eos_token'] = '<EOS>'
        if 'pad_token' in special_tokens_kv:
            special_tokens_kv['pad_token'] = '<PAD>'
        if len(corpus_path_list) > 1:
            os.remove(corpus_path)
    elif args.model in ['hf_bpe', 'hf_bytebpe', 'hf_wordpiece']:
        tokenizers = try_import_huggingface_tokenizers()
        if args.model == 'hf_bpe':
            split_on_whitespace_only = not args.split_punctuation
            tokenizer = tokenizers.CharBPETokenizer(
                lowercase=args.lowercase,
                bert_normalizer=args.bert_normalizer,
                split_on_whitespace_only=split_on_whitespace_only)
        elif args.model == 'hf_bytebpe':
            tokenizer = tokenizers.ByteLevelBPETokenizer(
                lowercase=args.lowercase)
        elif args.model == 'hf_wordpiece':
            unk_token = special_tokens_kv.get('unk_token', None)
            sep_token = special_tokens_kv.get('sep_token', None)
            cls_token = special_tokens_kv.get('cls_token', None)
            pad_token = special_tokens_kv.get('pad_token', None)
            mask_token = special_tokens_kv.get('mask_token', None)
            if args.bert_normalizer:
                strip_accents = None
                clean_text = True
                handle_chinese_chars = True
            else:
                strip_accents = False
                clean_text = False
                handle_chinese_chars = False
            tokenizer = tokenizers.BertWordPieceTokenizer(
                unk_token=unk_token,
                sep_token=sep_token,
                cls_token=cls_token,
                pad_token=pad_token,
                mask_token=mask_token,
                lowercase=args.lowercase,
                strip_accents=strip_accents,
                handle_chinese_chars=handle_chinese_chars,
                clean_text=clean_text)
        else:
            raise NotImplementedError
        tokenizer.train(corpus_path_list,
                        vocab_size=args.vocab_size,
                        show_progress=True,
                        special_tokens=special_tokens)
        # Deal with the API change of tokenizers >= 0.8
        if version.parse(tokenizers.__version__) >= version.parse('0.8'):
            save_model_path = model_prefix + '.model'
            tokenizer.save(save_model_path)
            with open(save_model_path, encoding='utf-8') as f_model:
                model_info = json.load(f_model)
            special_tokens_in_tokenizer = model_info['added_tokens']
            assert len(special_tokens_in_tokenizer) == len(special_tokens)
            hf_vocab = model_info['model']['vocab']
            hf_vocab_sorted = sorted(list(hf_vocab.items()),
                                     key=lambda x: x[1])
            hf_vocab_ids = [ele[1] for ele in hf_vocab_sorted]
            assert min(hf_vocab_ids) == 0 and max(
                hf_vocab_ids) == len(hf_vocab_ids) - 1
            vocab = [ele[0] for ele in hf_vocab_sorted]
        else:
            tokenizer.save(args.save_dir, args.model)
            # we replace the huggingface vocab file with our Vocab implementation
            if args.model == 'hf_wordpiece':
                hf_vocab_file = model_prefix + '-vocab.txt'
                with open(hf_vocab_file, 'r', encoding='utf-8') as fv:
                    for line in fv:
                        vocab.append(line.strip())
            else:
                # Move hf_${model}-merges.txt to hf_${model}.model
                os.rename(
                    os.path.join(args.save_dir,
                                 '{}-merges.txt'.format(args.model)),
                    os.path.join(args.save_dir, '{}.model'.format(args.model)))
                hf_vocab_file = model_prefix + '-vocab.json'
                with open(hf_vocab_file, 'r', encoding='utf-8') as fv:
                    vocab_kv = json.load(fv)
                    vocab_kv = sorted(list(vocab_kv.items()),
                                      key=lambda x: x[1])
                    for kv in vocab_kv:
                        vocab.append(kv[0])
            os.remove(hf_vocab_file)
    else:
        raise NotImplementedError
    vocab_obj = Vocab(vocab, **special_tokens_kv)
    vocab_obj.save(model_prefix + '.vocab')
    print('-------- Done Training -------------')
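# A minimal hedged sketch (not part of the original script) of how the merges
# file written by the subword_nmt branch above, i.e. <model_prefix>.model,
# could be re-applied to new text; the file path below is an assumed example.
import codecs
from subword_nmt import apply_bpe

with codecs.open('models/subword_nmt.model', encoding='utf-8') as codes:
    bpe = apply_bpe.BPE(codes)
print(bpe.process_line('a short sentence to segment'))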
Example #22
def create_corpus(src,
                  trg,
                  en_test_sents,
                  filedir,
                  lc=False,
                  seed=42,
                  bpe_size=10000,
                  dev_size=1000,
                  test_dir="test"):
    """ Create a BPE-preprocessed corpus with random train/dev/test splits. """
    source_file = "{}/jw300.{}-{}.{}".format(filedir, src, trg, src)
    target_file = "{}/jw300.{}-{}.{}".format(filedir, src, trg, trg)

    # download
    opus_reader = opustools_pkg.OpusRead(directory="JW300",
                                         source=src,
                                         target=trg,
                                         write_mode="moses",
                                         write=[source_file, target_file],
                                         suppress_prompts=True)
    opus_reader.printPairs()

    # unzip
    subprocess.Popen('gunzip JW300_latest_xml_{}-{}.xml.gz'.format(
        src, trg).split())

    # TMX file to dataframe
    source = []
    target = []
    skip_lines = []
    with open(source_file) as f:
        for i, line in enumerate(f):
            # skip sentences that are contained in the test set
            if line.strip() not in en_test_sents:
                source.append(line.strip())
            else:
                skip_lines.append(i)
    with open(target_file) as f:
        for j, line in enumerate(f):
            # only add to corpus if corresponding source was not skipped
            if j not in skip_lines:
                target.append(line.strip())
    print('Loaded data and skipped {} lines already contained in the test set.'.format(
        len(skip_lines)))

    df = pd.DataFrame(zip(source, target),
                      columns=['source_sentence', 'target_sentence'])

    # drop duplicate translations
    df_pp = df.drop_duplicates()

    # drop conflicting translations
    df_pp = df_pp.drop_duplicates(subset='source_sentence')
    df_pp = df_pp.drop_duplicates(subset='target_sentence')

    # shuffle
    df_pp = df_pp.sample(frac=1, random_state=seed).reset_index(drop=True)

    # do the split between dev/test/train and create parallel corpora
    num_dev_patterns = dev_size
    # test data is loaded from file
    # num_test_patterns = 1000

    # Lower case the corpora
    if lc:
        df_pp["source_sentence"] = df_pp["source_sentence"].str.lower()
        df_pp["target_sentence"] = df_pp["target_sentence"].str.lower()

    dev = df_pp.tail(num_dev_patterns)
    stripped = df_pp.drop(df_pp.tail(num_dev_patterns).index)

    train_src_file = "{}/train.{}-{}.{}".format(filedir, src, trg, src)
    train_trg_file = "{}/train.{}-{}.{}".format(filedir, src, trg, trg)
    dev_src_file = "{}/dev.{}-{}.{}".format(filedir, src, trg, src)
    dev_trg_file = "{}/dev.{}-{}.{}".format(filedir, src, trg, trg)
    # tests are already created
    #test_src_file = "{}/test.{}-{}.{}".format(filedir, src, trg, src)
    #test_trg_file = "{}/test.{}-{}.{}".format(filedir, src, trg, trg)

    stripped[["source_sentence"]].to_csv(train_src_file,
                                         header=False,
                                         index=False,
                                         quotechar="",
                                         quoting=csv.QUOTE_NONE,
                                         escapechar="\\",
                                         sep="§")
    stripped[["target_sentence"]].to_csv(train_trg_file,
                                         index=False,
                                         header=False,
                                         quotechar="",
                                         quoting=csv.QUOTE_NONE,
                                         escapechar="\\",
                                         sep="§")

    dev[["source_sentence"]].to_csv(dev_src_file,
                                    index=False,
                                    header=False,
                                    quotechar="",
                                    quoting=csv.QUOTE_NONE,
                                    escapechar="\\",
                                    sep="§")
    dev[["target_sentence"]].to_csv(dev_trg_file,
                                    index=False,
                                    header=False,
                                    quotechar="",
                                    quoting=csv.QUOTE_NONE,
                                    escapechar="\\",
                                    sep="§")

    #test[["source_sentence"]].to_csv(test_src_file, index=False, header=False, quotechar="", quoting=csv.QUOTE_NONE, escapechar="\\", sep="§")
    #test[["target_sentence"]].to_csv(test_trg_file, index=False, header=False, quotechar="", quoting=csv.QUOTE_NONE, escapechar="\\", sep="§")

    # train bpe (separately for src and trg)
    src_bpe_file = "{}/{}-{}.{}.bpe".format(filedir, src, trg, src)
    trg_bpe_file = "{}/{}-{}.{}.bpe".format(filedir, src, trg, trg)
    learn_bpe.learn_bpe(codecs.open(train_src_file, encoding='utf-8'),
                        codecs.open(src_bpe_file, "w", encoding='utf-8'),
                        bpe_size)
    learn_bpe.learn_bpe(codecs.open(train_trg_file, encoding='utf-8'),
                        codecs.open(trg_bpe_file, "w", encoding='utf-8'),
                        bpe_size)

    # apply bpe
    def bpe_process(inp, outp, codes):
        # open all files with context managers so they are closed (and flushed)
        with codecs.open(codes, encoding='utf-8') as codes_file, \
             codecs.open(inp, encoding='utf-8') as in_file, \
             codecs.open(outp, "w", encoding='utf-8') as out_file:
            bpe = apply_bpe.BPE(codes_file)
            for line in in_file:
                out_file.write(bpe.process_line(line))

    for split in ["train", "dev"]:
        for side in [src, trg]:
            bpe_process(
                "{}/{}.{}-{}.{}".format(filedir, split, src, trg, side),
                "{}/{}.{}-{}.bpe.{}".format(filedir, split, src, trg, side),
                "{}/{}-{}.{}.bpe".format(filedir, src, trg, side))

    for side in [src, trg]:
        bpe_process(
            "{}/{}.{}-{}.{}".format(test_dir, "test", src, trg, side),
            "{}/{}.{}-{}.bpe.{}".format(filedir, "test", src, trg, side),
            "{}/{}-{}.{}.bpe".format(filedir, src, trg, side))
Example #23
def main(args):
    corpus_path_list = args.corpus
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    model_prefix = os.path.join(args.save_dir, args.model)
    special_tokens_kv = OrderedDict()
    # unk is always required
    special_tokens_kv['unk_token'] = Vocab.UNK_TOKEN
    if not args.disable_bos:
        special_tokens_kv['bos_token'] = Vocab.BOS_TOKEN
    if not args.disable_eos:
        special_tokens_kv['eos_token'] = Vocab.EOS_TOKEN
    if not args.disable_pad:
        special_tokens_kv['pad_token'] = Vocab.PAD_TOKEN
    # split custom special tokens
    if args.model in ['yttm'] and len(args.custom_special_tokens) > 0:
        raise ValueError(
            'model {} does not support custom_special_tokens'.format(args.model))
    for custom_special_token in args.custom_special_tokens:
        kv = custom_special_token.split('=')
        if not len(kv) == 2:
            raise ValueError(
                'parameter {} has the wrong format'.format(custom_special_token))
        k, v = kv[0], kv[1]
        if k in special_tokens_kv:
            raise ValueError(
                'There are overlaps between the custom special tokens and the'
                ' unk, bos, eos, pad tokens')
        special_tokens_kv[k] = v
    # hf_wordpiece must contain mask, cls and sep tokens;
    # the custom-defined mask, cls and sep tokens can override the default settings
    if args.model == 'hf_wordpiece':
        if 'mask_token' not in special_tokens_kv:
            special_tokens_kv['mask_token'] = Vocab.MASK_TOKEN
        if 'cls_token' not in special_tokens_kv:
            special_tokens_kv['cls_token'] = Vocab.CLS_TOKEN
        if 'sep_token' not in special_tokens_kv:
            special_tokens_kv['sep_token'] = Vocab.SEP_TOKEN
    special_tokens = list(special_tokens_kv.values())
    print('special tokens: ' + ', '.join(special_tokens))
    vocab = []
    if args.model == 'spm':
        try_import_sentencepiece()
        import sentencepiece as spm
        corpus_path = ','.join(corpus_path_list)
        script = '--input={} --model_prefix={} --vocab_size={} --character_coverage={} --input_sentence_size={}' \
                 .format(corpus_path, model_prefix, args.vocab_size, args.coverage, args.input_sentence_size)
        script += (' --unk_id=' + str(special_tokens.index(Vocab.UNK_TOKEN)))
        script += (' --bos_id=' + ('-1' if args.disable_bos else str(
            special_tokens.index(Vocab.BOS_TOKEN))))
        script += (' --eos_id=' + ('-1' if args.disable_eos else str(
            special_tokens.index(Vocab.EOS_TOKEN))))
        script += (' --pad_id=' + ('-1' if args.disable_pad else str(
            special_tokens.index(Vocab.PAD_TOKEN))))
        if len(args.custom_special_tokens) > 0:
            ids_in_script = script.count('_id')
            script += (' --control_symbols=' +
                       ','.join(special_tokens[ids_in_script:]))
        print(script)
        spm.SentencePieceTrainer.Train(script)
        if 'bos_token' in special_tokens_kv:
            special_tokens_kv['bos_token'] = '<s>'
        if 'eos_token' in special_tokens_kv:
            special_tokens_kv['eos_token'] = '</s>'
        # build spm vocab
        spm_model = spm.SentencePieceProcessor()
        spm_model.load(model_prefix + '.model')
        vocab = [spm_model.id_to_piece(i) for i in range(len(spm_model))]
        os.remove(model_prefix + '.vocab')
    elif args.model == 'subword_nmt':
        try_import_subword_nmt()
        from subword_nmt import learn_bpe
        corpus_path = cat_corpus(corpus_path_list)\
            if len(corpus_path_list) > 1 else corpus_path_list[0]
        # build model
        with open(corpus_path, 'r', encoding='utf-8') as fc,\
             open(model_prefix + '.model', 'w', encoding='utf-8') as fm:
            learn_bpe.learn_bpe(fc,
                                fm,
                                args.vocab_size - len(special_tokens),
                                total_symbols=True)
        # build vocab
        with open(corpus_path, 'r', encoding='utf-8') as fc, \
             open(model_prefix + '.model', 'r', encoding='utf-8') as fm:
            vocab.extend(special_tokens)
            uniq_chars_internal = set()
            uniq_chars_final = set()
            uniq_words = set()
            for line in fc:
                for word in line.strip('\r\n ').split(' '):
                    if word:
                        uniq_words.add(word)
            # this code piece follows the same word-splitting logic as
            # https://github.com/rsennrich/subword-nmt/blob/master/subword_nmt/learn_bpe.py
            uniq_words = [
                tuple(x[:-1]) + (x[-1] + '</w>', ) for x in uniq_words
            ]
            for word in uniq_words:
                for char in word[:-1]:
                    uniq_chars_internal.add(char)
                uniq_chars_final.add(word[-1])
            # sort to ensure the same settings produce the same vocab
            vocab.extend(sorted(list(uniq_chars_internal)))
            vocab.extend(sorted(list(uniq_chars_final)))
            fm.readline()
            pair = fm.readline()
            while pair:
                vocab.append(pair.replace(' ', '', 1).strip())
                pair = fm.readline()
        if len(corpus_path_list) > 1:
            os.remove(corpus_path)
    elif args.model == 'yttm':
        try_import_yttm()
        import youtokentome as yttm
        corpus_path = cat_corpus(corpus_path_list)\
            if len(corpus_path_list) > 1 else corpus_path_list[0]
        tokenizer = yttm.BPE.train(
            data=corpus_path,
            model=model_prefix + '.model',
            vocab_size=args.vocab_size,
            coverage=args.coverage,
            n_threads=args.n_threads,
            unk_id=special_tokens.index(Vocab.UNK_TOKEN),
            bos_id=-1
            if args.disable_bos else special_tokens.index(Vocab.BOS_TOKEN),
            eos_id=-1
            if args.disable_eos else special_tokens.index(Vocab.EOS_TOKEN),
            pad_id=-1
            if args.disable_pad else special_tokens.index(Vocab.PAD_TOKEN))
        vocab = tokenizer.vocab()
        if 'unk_token' in special_tokens_kv:
            special_tokens_kv['unk_token'] = '<UNK>'
        if 'bos_token' in special_tokens_kv:
            special_tokens_kv['bos_token'] = '<BOS>'
        if 'eos_token' in special_tokens_kv:
            special_tokens_kv['eos_token'] = '<EOS>'
        if 'pad_token' in special_tokens_kv:
            special_tokens_kv['pad_token'] = '<PAD>'
        if len(corpus_path_list) > 1:
            os.remove(corpus_path)
    elif args.model in ['hf_bpe', 'hf_bytebpe', 'hf_wordpiece']:
        tokenizers = try_import_huggingface_tokenizers()
        if args.model == 'hf_bpe':
            tokenizer = tokenizers.CharBPETokenizer(lowercase=args.lowercase)
        elif args.model == 'hf_bytebpe':
            tokenizer = tokenizers.ByteLevelBPETokenizer(
                lowercase=args.lowercase)
        elif args.model == 'hf_wordpiece':
            tokenizer = tokenizers.BertWordPieceTokenizer(
                lowercase=args.lowercase, strip_accents=args.strip_accents)
        else:
            raise NotImplementedError
        tokenizer.train(corpus_path_list,
                        vocab_size=args.vocab_size,
                        show_progress=True,
                        special_tokens=special_tokens)
        tokenizer.save(args.save_dir, args.model)
        # we replace the huggingface vocab file with our Vocab implementation
        if args.model == 'hf_wordpiece':
            hf_vocab_file = model_prefix + '-vocab.txt'
            with open(hf_vocab_file, 'r', encoding='utf-8') as fv:
                for line in fv:
                    vocab.append(line.strip())
        else:
            # Move hf_${model}-merges.txt to hf_${model}.model
            os.rename(
                os.path.join(args.save_dir,
                             '{}-merges.txt'.format(args.model)),
                os.path.join(args.save_dir, '{}.model'.format(args.model)))
            hf_vocab_file = model_prefix + '-vocab.json'
            with open(hf_vocab_file, 'r', encoding='utf-8') as fv:
                vocab_kv = json.load(fv)
                vocab_kv = sorted(list(vocab_kv.items()), key=lambda x: x[1])
                for kv in vocab_kv:
                    vocab.append(kv[0])
        os.remove(hf_vocab_file)
    else:
        raise NotImplementedError
    unk_token = special_tokens_kv.pop('unk_token')
    vocab_obj = Vocab(vocab, unk_token=unk_token, **special_tokens_kv)
    vocab_obj.save(model_prefix + '.vocab')
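# main() above expects an argument parser that is not shown in this snippet.
# The following is only a sketch of what it might look like: flag names and
# defaults are assumptions inferred from the attributes accessed in main(),
# not the original parse_args implementation.
import argparse

def parse_args_sketch():
    parser = argparse.ArgumentParser(
        description='Learn a subword model and export a Vocab (sketch only).')
    parser.add_argument('--corpus', nargs='+', required=True,
                        help='one or more training text files')
    parser.add_argument('--save-dir', dest='save_dir', default='.')
    parser.add_argument('--model', default='spm',
                        choices=['spm', 'subword_nmt', 'yttm',
                                 'hf_bpe', 'hf_bytebpe', 'hf_wordpiece'])
    parser.add_argument('--vocab-size', dest='vocab_size', type=int,
                        default=30000)
    parser.add_argument('--coverage', type=float, default=0.9995)
    parser.add_argument('--input-sentence-size', dest='input_sentence_size',
                        type=int, default=1000000)
    parser.add_argument('--n-threads', dest='n_threads', type=int, default=4)
    parser.add_argument('--custom-special-tokens',
                        dest='custom_special_tokens', nargs='*', default=[])
    parser.add_argument('--disable-bos', dest='disable_bos',
                        action='store_true')
    parser.add_argument('--disable-eos', dest='disable_eos',
                        action='store_true')
    parser.add_argument('--disable-pad', dest='disable_pad',
                        action='store_true')
    parser.add_argument('--lowercase', action='store_true')
    parser.add_argument('--strip-accents', dest='strip_accents',
                        action='store_true')
    return parser.parse_args()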
Example #24
def main():
    options = parse_args()
    torch.manual_seed(options.seed)
    basename = os.path.splitext(os.path.basename(options.input))[0]
    out_dir = options.out_dir or "data/{}/".format(basename)
    spinner = Halo(spinner="dots", placement="right")

    with open(options.input, "r", encoding="utf8") as fd:
        reader = csv.reader(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
        lines = [[line[0]] for line in reader]

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    output_full = os.path.join(out_dir, "{}.tsv".format(basename))
    with open(output_full, "w", encoding="utf8") as fd:
        writer = csv.writer(fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar="")
        writer.writerows(lines)

    vocab_size = 32000
    spiece_out = os.path.join(out_dir, "spiece")
    spiece_args = (
        "--input={} "
        "--model_prefix={} "
        "--vocab_size={} "
        "--character_coverage=1.0"
    ).format(output_full, spiece_out, vocab_size)
    SentencePieceTrainer.Train(spiece_args)
    # Load the generated vocabulary
    with open("{}.vocab".format(spiece_out), "r", encoding="utf8") as fd:
        reader = csv.reader(
            fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
        )
        vocab = [line[0] for line in reader]
    # Remove the special tokens <unk>, <s>, </s>
    vocab = vocab[3:]

    # Convert to BERT style
    bert_vocab = [
        v[1:] if v.startswith("▁") else "##{}".format(v) for v in vocab if v != "▁"
    ]
    # Add BERT's special tokens to the beginning
    bert_vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + bert_vocab
    # Fill up with unused tokens
    pad_size = vocab_size - len(bert_vocab)
    bert_vocab += ["unused{}".format(i) for i in range(pad_size)]
    with open(os.path.join(out_dir, "vocab.txt"), "w", encoding="utf8") as fd:
        writer = csv.writer(
            fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
        )
        writer.writerows([[b] for b in bert_vocab])

    # Convert to GPT-2 style
    # Unfortunately it's slow and tedious.
    spinner.start(text="Generating BPE vocabulary")
    gpt2_vocab = ["Ġ{}".format(v[1:]) if v.startswith("▁") else v for v in vocab]
    # Add the GPT-2 special token to the end
    gpt2_vocab.append("<|endoftext|>")
    with open(os.path.join(out_dir, "vocab.json"), "w", encoding="utf8") as fd:
        json.dump({v: i for i, v in enumerate(gpt2_vocab)}, fd, ensure_ascii=False)
    spiece_processor = SentencePieceProcessor()
    spiece_processor.Load("{}.model".format(spiece_out))
    # Encode the whole text
    encoded = [
        [" ".join(spiece_processor.EncodeAsPieces(line[0])).replace("▁", "Ġ")]
        for line in lines
    ]
    tmp_encoded_fd, tmp_encoded_path = tempfile.mkstemp()
    tmp_bpe_fd, tmp_bpe_path = tempfile.mkstemp()
    try:
        # Write the encoded text to a temporary file.
        with os.fdopen(tmp_encoded_fd, "w", encoding="utf8") as fd:
            writer = csv.writer(
                fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
            )
            writer.writerows(encoded)
        learn_bpe(
            open(tmp_encoded_path, "r", encoding="utf8"),
            open(tmp_bpe_path, "w", encoding="utf8"),
            num_symbols=vocab_size,
        )
        with open(tmp_bpe_path, "r", encoding="utf8") as fd:
            reader = csv.reader(
                fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
            )
            seen = set()
            merges = []
            for line in reader:
                # Get rid of the </w> tokens
                line = line[0].replace("</w>", "")
                # Remove duplicates (due to </w> tokens)
                if line not in seen:
                    seen.add(line)
                    merges.append([line])
        with open(os.path.join(out_dir, "merges.txt"), "w", encoding="utf8") as fd:
            writer = csv.writer(
                fd, delimiter="\t", quoting=csv.QUOTE_NONE, quotechar=""
            )
            writer.writerows(merges)
    finally:
        os.remove(tmp_encoded_path)
        os.remove(tmp_bpe_path)
    spinner.stop()
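# A small hedged check, not part of the original script, that the three
# vocabulary artifacts written above line up; "data/my_corpus" stands in for
# whatever out_dir resolved to and is only an assumed placeholder.
import json
import os

out_dir = "data/my_corpus"
with open(os.path.join(out_dir, "vocab.txt"), encoding="utf8") as fd:
    bert_size = sum(1 for _ in fd)   # BERT-style vocab, one token per line
with open(os.path.join(out_dir, "vocab.json"), encoding="utf8") as fd:
    gpt2_size = len(json.load(fd))   # GPT-2 style token -> id mapping
with open(os.path.join(out_dir, "merges.txt"), encoding="utf8") as fd:
    n_merges = sum(1 for _ in fd)    # deduplicated BPE merge rules
print("BERT vocab: {}, GPT-2 vocab: {}, merges: {}".format(
    bert_size, gpt2_size, n_merges))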