def main():
    args = cmd_args()
    outdir = args.o if args.o else os.path.dirname(args.i)

    target_special_tokens, subtoken_special_tokens = get_special_tokens(
        args.preset)
    with tempfile.TemporaryDirectory() as tmp_dir:
        targets_file = os.path.join(tmp_dir, "labels.txt")
        subtokens_file = os.path.join(tmp_dir, "subtokens.txt")

        print(f"Creating training files for BPE")
        create_bpe_training_file(args.i, targets_file, subtokens_file)
        if args.preset == Preset.variable:
            print("Variable preset")

        subtoken_tokenizer = SentencePieceBPETokenizer()
        target_tokenizer = SentencePieceBPETokenizer()
        print(f"Training subtoken tokenizer")
        subtoken_tokenizer.add_special_tokens(subtoken_special_tokens)
        print(f"Training target tokenizer")
        target_tokenizer.add_special_tokens(target_special_tokens)

        target_tokenizer.train(files=[targets_file],
                               vocab_size=args.target_vocab)
        subtoken_tokenizer.train(files=[subtokens_file],
                                 vocab_size=args.subtoken_vocab)

    target_tokenizer.save(outdir, "target.bpe")
    subtoken_tokenizer.save(outdir, "subtoken.bpe")
    def __init__(self, args: Namespace):
        super().__init__()

        self.target_encoder = SentencePieceBPETokenizer(
            args.target_vocab, args.target_merges)
        self.subtoken_encoder = SentencePieceBPETokenizer(
            args.subtoken_vocab, args.subtoken_merges)
        # self.target_encoder.add_special_tokens(
        #     [self.EOS_TOKEN, self.SOS_TOKEN, self.PAD_TOKEN]
        # )
        # self.subtoken_encoder.add_special_tokens([self.EOS_TOKEN, self.PAD_TOKEN])

        with open(args.node_dict, "rb") as f:
            self.node_to_index = pickle.load(f)
            self.index_to_node = {v: k for k, v in self.node_to_index.items()}
Example #3
    def test_train_from_iterator(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        output = tokenizer.encode("A sentence")
        assert output.tokens == ["▁A", "▁sentence"]
Example #4
    def load(vocab_file=None):
        if not os.path.exists(vocab_file):
            raise Exception("{} is not exist".format(vocab_file))
        path, filename = os.path.split(vocab_file)
        ttype = filename.split("_")[0]
        merges_file = os.path.join(
            path, filename.replace("vocab.json", "merges.txt"))
        if ttype == "byte":
            if not os.path.exists(merges_file):
                raise Exception("{} is not exist".format(merges_file))
            tokenizer = ByteLevelBPETokenizer(
                add_prefix_space=True,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=vocab_file,
                merges_file=merges_file,
                dropout=None,
                continuing_subword_prefix=None,
                end_of_word_suffix=None)

        elif ttype == "char":
            if not os.path.exists(merges_file):
                raise Exception("{} is not exist".format(merges_file))
            tokenizer = CharBPETokenizer(
                unk_token=unk_token,  # required
                suffix=suffix_token,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=vocab_file,
                merges_file=merges_file,
                dropout=None)

        elif ttype == "bert":
            tokenizer = BertWordPieceTokenizer(
                clean_text=True,  # required
                handle_chinese_chars=True,  # required
                strip_accents=True,  # required
                lowercase=True,  # required
                vocab_file=vocab_file,
                # add_special_tokens=True,
                unk_token=BUNK,
                sep_token=BSEP,
                cls_token=BCLS,
                wordpieces_prefix=BPRE)

        elif ttype == "sent":
            if not os.path.exists(merges_file):
                raise Exception("{} is not exist".format(merges_file))
            tokenizer = SentencePieceBPETokenizer(
                add_prefix_space=True,  # required
                unk_token=unk_token,
                replacement=rep_token,
                vocab_file=vocab_file,
                merges_file=merges_file,
                dropout=None)

        else:
            raise Exception("Not implement yet")

        return tokenizer
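The loader above infers the tokenizer type from the filename prefix ("byte", "char", "bert" or "sent") and derives the merges file by replacing "vocab.json" with "merges.txt". A minimal usage sketch, with a hypothetical path that follows this naming convention:

# Hypothetical call: "sent_vocab.json" selects the SentencePieceBPETokenizer branch
# and resolves "sent_merges.txt" in the same directory.
tokenizer = load("tokenizers/sent_vocab.json")
print(tokenizer.encode("hello world").tokens)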
 def train_tokenizer(self, paths: List[str], vocab_size: int,
                     to_save_dir: str, languages: Dict[str, int]):
     self.tokenizer = SentencePieceBPETokenizer()
     self.init_properties(languages)
     self.tokenizer.train(files=paths,
                          vocab_size=vocab_size,
                          min_frequency=5,
                          special_tokens=self.special_tokens)
     self.save(directory=to_save_dir)
Example #6
 def __init__(self, dataset_folder, tokenizer_method):
     self.dataset_folder = dataset_folder
     self.tokenizer_method = tokenizer_method
     if tokenizer_method == "sentencepiece":
         self.tokenizer = SentencePieceBPETokenizer(
             "./data/sentencepiece_tokenizer/vocab.json",
             "./data/sentencepiece_tokenizer/merges.txt")
     elif tokenizer_method == "bert":
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def main():
    #argparser
    parser = argparse.ArgumentParser(
        prog="train_mlm_camembert_thai.py",
        description="train mlm for Camembert with huggingface Trainer",
    )

    #required
    parser.add_argument("--bpe_tokenizer",
                        type=str,
                        default='sentencepiece',
                        help='Specify the name of BPE Tokenizer')
    parser.add_argument("--vocab_size", type=int, default=52000)
    parser.add_argument("--min_frequency", type=int, default=2)
    parser.add_argument(
        "--train_dir",
        type=str,
    )
    parser.add_argument(
        "--output_dir",
        type=str,
    )
    parser.add_argument("--ext", type=str, default='.txt')

    args = parser.parse_args()

    fnames = [str(x) for x in glob.glob(f"{args.train_dir}/*{args.ext}")]

    # Initialize a tokenizer
    if args.bpe_tokenizer == 'byte_level':
        _BPE_TOKENIZER = ByteLevelBPETokenizer()
    if args.bpe_tokenizer == 'char':
        _BPE_TOKENIZER = CharBPETokenizer()
    if args.bpe_tokenizer == 'sentencepiece':
        _BPE_TOKENIZER = SentencePieceBPETokenizer()

    tokenizer = _BPE_TOKENIZER

    # Customize training
    tokenizer.train(files=fnames,
                    vocab_size=args.vocab_size,
                    min_frequency=args.min_frequency,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    # Save files to disk
    tokenizer.save_model(args.output_dir)

    #test
    tokenizer = CamembertTokenizer.from_pretrained(args.output_dir)
    print(tokenizer.encode_plus('สวัสดีครับ hello world'))
Example #8
def get_default_tokenizer():
    from tokenizers import SentencePieceBPETokenizer

    tokenizer = SentencePieceBPETokenizer(path.join(VOCAB_PATH,
                                                    'en-vocab.json'),
                                          path.join(VOCAB_PATH,
                                                    'en-merges.txt'),
                                          unk_token='[UNK]')

    return tokenizer
 def __init__(self, tok_model_path: Optional[str] = None):
     self.languages = {}
     if tok_model_path is not None:
         self.tokenizer = SentencePieceBPETokenizer(
             tok_model_path + "/vocab.json",
             tok_model_path + "/merges.txt",
         )
         with open(os.path.join(tok_model_path, "langs"), "rb") as fp:
             self.languages: Dict[str, int] = pickle.load(fp)
     self.init_properties(self.languages)
    def __init__(self, max_meta_len: int, max_body_len: int,
                 ignore_meta_prob: float):
        tokenizer = SentencePieceBPETokenizer(vocab_file=str(_VOCAB),
                                              merges_file=str(_MERGES))

        super().__init__(tokenizer=tokenizer,
                         max_meta_len=max_meta_len,
                         max_body_len=max_body_len,
                         ignore_meta_prob=ignore_meta_prob,
                         pad_token='<pad>')
Example #11
 def __init__(self, path, max_tokens):
     self.logger = log.getLogger("Tokenizer")
     self.logger.info("loading tokenizer")
     self.logger.info("path: " + path)
     self.logger.info("max_tokens: " + str(max_tokens))
     self.tokenizer = SentencePieceBPETokenizer(
         os.path.join(path, "vocab.json"), os.path.join(path, "merges.txt"))
     self.max_tokens = max_tokens
     self.idx = {}
     for s in ['</s>', '<s>', '<pad>']:
         self.idx[s] = self.tokenizer.token_to_id(s)
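The idx mapping built above is typically consumed by an encode step that adds the sentence delimiters and enforces the token budget. A minimal sketch of such a helper (not part of the original snippet; the method name and truncation policy are assumptions):

 def encode(self, text):
     # Reserve two slots for the delimiters, truncate the rest to max_tokens.
     ids = self.tokenizer.encode(text).ids[:self.max_tokens - 2]
     return [self.idx['<s>']] + ids + [self.idx['</s>']]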
Example #12
 def fit_on_text(self, text):
     if self.lower:
         text = text.lower()
     words = text.split()
     tokenizer1 = SentencePieceBPETokenizer(vocab, merges)
     for word in words:
         for sub_word in tokenizer1.encode(word).tokens:
             if sub_word not in self.word2idx:
                 self.word2idx[sub_word] = self.idx
                 self.idx2word[self.idx] = sub_word
                 self.idx += 1
Example #13
 def train(corpus_list, vocab_size, output, output_name=None):
     print("create tokenizer...")
     tokenizer = SentencePieceBPETokenizer()
     print("load corpus list...")
     corpus_list = open(corpus_list).read().split('\n')[:-1]
     print("train tokenizer...")
     tokenizer.train(
         corpus_list,
         vocab_size=vocab_size,
         special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
     print("save model...")
     tokenizer.save_model(output, output_name)
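For context, the helper above expects a manifest file with one corpus path per line and trains with the usual RoBERTa-style special tokens. An illustrative call (file names are placeholders, and the method is assumed to be callable as shown):

train("corpus_list.txt", vocab_size=32000, output="tokenizer_out", output_name="spm_bpe")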
Example #14
 def configure(self):
     if isinstance(SentencePieceBPETokenizer, UnsupportedPackage):
         SentencePieceBPETokenizer.raise_error(self.__provider__)
     self.tokenizer = SentencePieceBPETokenizer(
         str(self.get_value_from_config('vocabulary_file')),
         str(self.get_value_from_config('merges_file')))
     self.add_extra_symbols = self.get_value_from_config(
         'add_extra_symbols')
     self.idx = {}
     for s in ['sos', 'eos']:
         self.idx[s] = self.tokenizer.token_to_id(
             str(self.get_value_from_config(s + '_symbol')))
Example #15
 def configure(self):
     if isinstance(SentencePieceBPETokenizer, UnsupportedPackage):
         SentencePieceBPETokenizer.raise_error(self.__provider__)
     self.tokenizer = SentencePieceBPETokenizer(
         str(self.get_value_from_config('vocabulary_file')),
         str(self.get_value_from_config('merges_file')))
     self.remove_extra_symbols = self.get_value_from_config(
         'remove_extra_symbols')
     self.idx = {}
     for s in ['sos', 'eos', 'pad']:
         self.idx[s] = str(self.get_value_from_config(s + '_symbol'))
     self.output_name = self.get_value_from_config('output_name')
     self.output_checked = False
Example #16
 def __init__(self, path, max_tokens):
     self.logger = log.getLogger("Tokenizer")
     self.logger.info("loading tokenizer")
     self.logger.info(f"path: {path}")
     self.logger.info(f"max_tokens: {max_tokens}")
     self.tokenizer = SentencePieceBPETokenizer(
         str(path / "vocab.json"),
         str(path / "merges.txt"),
     )
     self.max_tokens = max_tokens
     self.idx = {}
     for s in ['</s>', '<s>', '<pad>']:
         self.idx[s] = self.tokenizer.token_to_id(s)
    def __init__(self, tok_type, unk_token, sep_token, cls_token, pad_token,
                 mask_token):
        self.tok_type = tok_type

        if self.tok_type == 'bpe':
            self.tokenizer = ByteLevelBPETokenizer()
        elif self.tok_type == 'wordpiece':
            self.tokenizer = BertWordPieceTokenizer(unk_token=unk_token,
                                                    sep_token=sep_token,
                                                    cls_token=cls_token,
                                                    pad_token=pad_token,
                                                    mask_token=mask_token)
        elif self.tok_type == 'sentencepiece':
            self.tokenizer = SentencePieceBPETokenizer(unk_token=unk_token)
Example #18
    def set_tokenizer(self):

        if self.storage_method == "raw":
            pass  # Essentially keep it None. Important for exceptions
        elif self.storage_method == "bert":
            self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        elif self.storage_method == "roberta":
            self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        elif self.storage_method == "token":
            self.tokenizer = SentencePieceBPETokenizer(
                os.path.join(self.tokenizer_path, "/vocab.json"),
                os.path.join(self.tokenizer_path, "merges.txt"))
        else:
            raise ValueError("Unknown storage method encountered!")
Example #19
    def __init__(self, args):
        self.args = args
        if self.args.type == "byte":
            self.tokenizer = ByteLevelBPETokenizer(
                add_prefix_space=True,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=None,
                merges_file=None,
                dropout=None,
                continuing_subword_prefix=None,
                end_of_word_suffix=None)

        elif self.args.type == "char":
            self.tokenizer = CharBPETokenizer(
                unk_token=unk_token,  # required
                suffix=suffix_token,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=None,
                merges_file=None,
                dropout=None)

        elif self.args.type == "bert":
            self.tokenizer = BertWordPieceTokenizer(
                clean_text=True,  # required
                handle_chinese_chars=True,  # required
                strip_accents=True,  # required
                lowercase=True,  # required
                vocab_file=None,
                # add_special_tokens=True,
                unk_token=BUNK,
                sep_token=BSEP,
                cls_token=BCLS,
                wordpieces_prefix=BPRE)

        elif self.args.type == "sent":
            self.tokenizer = SentencePieceBPETokenizer(
                add_prefix_space=True,  # required
                unk_token=unk_token,
                replacement=rep_token,
                vocab_file=None,
                merges_file=None,
                dropout=None)

        else:
            raise Exception("Not implement yet")

        pass
Example #20
def main():
    args = cmd_args()
    outdir = args.o if args.o else os.path.dirname(args.i)

    print(
        f"Training SentencePiece to create a vocabulary of size {args.vocab_size}"
    )
    with tempfile.TemporaryDirectory() as tmp_dir:
        train_file = os.path.join(tmp_dir, "train.txt")
        create_bpe_training_file(args.i, train_file)

        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train(files=[train_file], vocab_size=args.vocab_size)

    tokenizer.save(outdir, args.n)
Example #21
 def load_tokenizer(path,
                    enable_truncation=True,
                    enable_padding=True,
                    max_length=512):
     tokenizer = SentencePieceBPETokenizer(os.path.join(path, "vocab.json"),
                                           os.path.join(path, "merges.txt"))
     tokenizer._tokenizer.post_processor = BertProcessing(
         ("</s>", tokenizer.token_to_id("</s>")),
         ("<s>", tokenizer.token_to_id("<s>")),
     )
     if enable_truncation:
         tokenizer.enable_truncation(max_length=max_length)
     if enable_padding:
         tokenizer.enable_padding(pad_token="<pad>",
                                  pad_id=tokenizer.token_to_id("<pad>"))
     return tokenizer
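With the BertProcessing post-processor attached, every encoding is wrapped in the <s> ... </s> delimiters; truncation caps sequences at max_length, and padding aligns batches to their longest member. A short illustration (the path is a placeholder):

tokenizer = load_tokenizer("path/to/tokenizer", max_length=128)
enc = tokenizer.encode("a short sentence")
# enc.tokens begins with '<s>' and ends with '</s>'; encode_batch pads shorter
# sequences with '<pad>' up to the longest sequence in the batch.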
Example #22
def load_tokenizer(langpair: str) -> SentencePieceBPETokenizer:
    if langpair in ["en-de", "de-en", "ende", "deen", "ENDE", "EN-DE"]:
        langpair = "deen"

    tokenizer_dir = Path(__file__).parent.parent / "src" / "tokenizer"
    vocab_filepath = (
        tokenizer_dir / f"sentencepiece_bpe_wmt14_{langpair}.tokenizer-vocab.json"
    )
    merges_filepath = (
        tokenizer_dir / f"sentencepiece_bpe_wmt14_{langpair}.tokenizer-merges.txt"
    )

    tokenizer = SentencePieceBPETokenizer(
        vocab_file=str(vocab_filepath),
        merges_file=str(merges_filepath),
    )
    return tokenizer
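A usage note: the language-pair string is normalized before the file names are built, so the variants listed in the first branch all resolve to the same deen tokenizer files:

tokenizer = load_tokenizer("en-de")  # loads the same files as load_tokenizer("deen")
ids = tokenizer.encode("Guten Morgen").ids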
Example #23
def build_bpe(vocab_size=10000):
    # Initialize a tokenizer
    tokenizer = SentencePieceBPETokenizer()

    #mypath = "../../Downloads/riksdagens_protokoll_1920-2020/annual"
    mypath = "../../Desktop/cood/python/machine-learning/old-school/markov-lstm-killer/data/fi"
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    print("ONL", onlyfiles)

    paths = [mypath + "/" + f for f in onlyfiles]

    #paths = paths[:5]

    # COPY FILES
    txts = []
    for path, fname in zip(paths, onlyfiles):
        if path[-4:] == ".txt":
            localpath = "data/" + fname
            txts.append(localpath)

            infile = open(path)
            outfile = open(localpath, "w")

            for line in infile:
                clean_line = cleanup(line) + "\n"
                outfile.write(clean_line)

            outfile.close()

    # Then train it!
    #tokenizer.train([ "../../Downloads/riksdagens_protokoll_1920-2020/annual/prot_2019.txt" ], vocab_size=15000)
    tokenizer.train(txts, vocab_size=vocab_size)

    # Now, let's use it:
    s = "Det politiska arbetet har redan börjat på olika sätt, med resor, besök, möten, politikutveckling, motionsskrivande och mycket annat. Jag har sett att ni redan har varit aktiva under ett antal veckor, och jag kan försäkra er att det även gäller talmanspresidiet. Nu är det dags att med tillförsikt påbörja ett nytt riksdagsår. Jag hoppas att ni alla ser fram emot det lika myck­et som jag gör."
    #s = "Ite en oo viel mitää hyvää kyl sielt syöny."
    #s = "ja kieltämät siihe tommoste kokonaisii sanoi merkitsevät tavumerkit on huomattavasti näppärämpii ku ääniä tarkottavat aakkoset joist pitää rakentaa jokane sana"
    encoded = tokenizer.encode(s)

    print(encoded.ids)
    print(encoded.tokens)
    # And finally save it somewhere
    tokenizer.save("./bpe-fi.tokenizer.json")
Example #24
def train_kenlm_language_model(input_data_paths, output_model_dir):
    output_model_dir = Path(output_model_dir)
    output_model_dir.mkdir(exist_ok=True, parents=True)
    output_model_path = output_model_dir / 'kenlm_model.arpa'
    with log_action('Training tokenizer'):
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train([str(path) for path in input_data_paths], vocab_size=20000)
        tokenizer.save(str(output_model_dir), 'spm_tokenizer')
    with log_action('Tokenizing'):
        tokenized_data_paths = get_temp_filepaths(len(input_data_paths))
        for tokenized_data_path, input_data_path in zip(tokenized_data_paths, input_data_paths):
            encodings = tokenizer.encode_batch(read_lines(input_data_path))
            write_lines([' '.join(encoding.tokens) for encoding in encodings], tokenized_data_path)
    with log_action('Training language model'):
        kenlm_path = input('Please provide the path to the lmplz script (install at https://github.com/kpu/kenlm): ')
        command = (
            f'cat {" ".join([str(path) for path in tokenized_data_paths])} | {kenlm_path} -o 3 > {output_model_path}'
        )
        run_command(command, mute=False)
    for path in tokenized_data_paths:
        path.unlink()
    return output_model_dir
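An illustrative invocation of the helper above (paths are placeholders; the lmplz binary from kenlm must be available when the script prompts for it):

model_dir = train_kenlm_language_model(
    ['data/corpus_a.txt', 'data/corpus_b.txt'], 'models/kenlm')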
 def __init__(self,
              vocab_file,
              merges_file,
              unk_token="<unk>",
              bos_token="<s>",
              eos_token="</s>",
              pad_token="<pad>",
              add_prefix_space=False,
              **kwargs):
     super().__init__(
         SentencePieceBPETokenizer(
             vocab_file=vocab_file,
             merges_file=merges_file,
             add_prefix_space=add_prefix_space,
         ),
         bos_token=bos_token,
         eos_token=eos_token,
         unk_token=unk_token,
         pad_token=pad_token,
         **kwargs,
     )
Example #26
    def __init__(self, fname, tokenizer, dat_fname):
        if os.path.exists(dat_fname):
            print('loading dataset:', dat_fname)
            self.data = pickle.load(open(dat_fname, 'rb'))

        else:
            tokenizer1 = SentencePieceBPETokenizer(vocab, merges)
            reader = pd.read_excel(fname)
            all_data = []
            for i in range(reader.shape[0]):

                text_raw1=[]
                text_raw2=[]
                column_name1 = tokenizer1.encode(reader.iloc[i][0].lower().strip()).tokens
                [text_raw1.extend(tokenizer1.encode(x).tokens) for x in reader.iloc[i][2].lower().strip().split(' ')]
                column_name2 = tokenizer1.encode(reader.iloc[i][1].lower().strip()).tokens
                [text_raw2.extend(tokenizer1.encode(x).tokens) for x in reader.iloc[i][3].lower().strip().split(' ')]
                class_n = reader.iloc[i][4]

                text_raw_indices1 = tokenizer.text_to_sequence(text_raw1)
                aspect_indices1 = tokenizer.text_to_sequence(column_name1)
                text_raw_indices2 = tokenizer.text_to_sequence(text_raw2)
                aspect_indices2 = tokenizer.text_to_sequence(column_name2)
                data = {
                    'text_raw_indices1': text_raw_indices1,
                    'aspect_indices1': aspect_indices1,
                    'text_raw_indices2': text_raw_indices2,
                    'aspect_indices2': aspect_indices2,
                    'class_n': int(class_n),
                }
                all_data.append(data)
            self.data = all_data

            pickle.dump(self.data, open(dat_fname, 'wb'))

            print("Finished write data file")
Example #27
    STORAGE_BUCKET = "gs://sbt0"

    # for prefix in prefixes:
    #     input_dir_gs = os.path.join(
    #         STORAGE_BUCKET,
    #         "data/corpus/%s_lower/zhwiki-latest-pages-articles_%s_lower.txt" % (prefix, prefix)
    #     )
    #     input_dir_local = "./zhwiki-latest-pages-articles_%s_lower.txt" % prefix
    #     tf.gfile.Copy(input_dir_gs, input_dir_local, overwrite=True)

    for vocab_size in vocab_sizes:
        for prefix in prefixes:
            try:
                tokenizer_name = prefix + "_" + str(vocab_size)
                tokenizer = SentencePieceBPETokenizer()

                tokenizer.train(
                    [
                        "./zhwiki-latest-pages-articles_%s_lower.txt" % prefix
                        # "./zhwiki-latest-pages-articles_lower.txt"
                    ],
                    vocab_size=vocab_size,
                    show_progress=True,
                    min_frequency=1,
                    special_tokens=[
                        "<unk>", "[SEP]", "[CLS]", "[PAD]", "[MASK]"
                    ])
                tokenizer.save("data_proc/tokenizers/sentencepiece",
                               tokenizer_name)
Example #28
    data = load_both(args.reference, args.nbest)

    processed_ids = set()
    if args.c is not None:
        with jsonlines.open(args.c) as reader:
            for obj in reader:
                processed_ids.add(obj['id'])

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    if args.opi:
        from tokenizers import SentencePieceBPETokenizer
        from tokenizers.processors import RobertaProcessing

        tokenizer = SentencePieceBPETokenizer(f"{args.model}/vocab.json", f"{args.model}/merges.txt")
        getattr(tokenizer, "_tokenizer").post_processor = RobertaProcessing(sep=("</s>", 2), cls=("<s>", 0))
        tokenizer.mask_token_id = model.roberta.embeddings.word_embeddings.weight.shape[0] - 1  # last is mask?

    model.eval()

    if device == 'cuda':
        if args.half:
            model.half()
        model.to(device)

    start = time.time()
    count = 0
    with jsonlines.open(args.output, mode='w', flush=True) as writer:
        for id, utt in tqdm.tqdm(data.items(), desc="Texts"):
            if id in processed_ids: continue
import argparse
from tokenizers import SentencePieceBPETokenizer
from tokenizers.trainers import BpeTrainer

parser = argparse.ArgumentParser()

parser.add_argument("--corpus_file", type=str)
parser.add_argument("--vocab_size", type=int, default=32000)
parser.add_argument("--limit_alphabet", type=int, default=6000)

args = parser.parse_args()

# Note: clean_text, handle_chinese_chars, strip_accents, lowercase and
# wordpieces_prefix are BertWordPieceTokenizer options; SentencePieceBPETokenizer
# does not accept them, so the tokenizer is created with its defaults and
# trained from scratch below.
tokenizer = SentencePieceBPETokenizer()

tokenizer.train(files=[args.corpus_file],
                limit_alphabet=args.limit_alphabet,
                vocab_size=args.vocab_size)

tokenizer.save("./", "ch-{}-wpm-{}".format(args.limit_alphabet,
                                           args.vocab_size))
Example #30
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help=
        "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--keep_accents",
                        action="store_const",
                        const=True,
                        help="Set this flag if model is trained with accents.")
    parser.add_argument(
        "--strip_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained without accents.")
    parser.add_argument("--use_fast",
                        action="store_const",
                        const=True,
                        help="Set this flag to use fast tokenization.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=500,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=500,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    print(args.model_name_or_path)

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        id2label={str(i): label
                  for i, label in enumerate(labels)},
        label2id={label: i
                  for i, label in enumerate(labels)},
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer_args = {
        k: v
        for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS
    }
    logger.info("Tokenizer arguments: %s", tokenizer_args)

    base_dir = r'/net/people/plgpgajdzica/scratch/ner/data/embeddings/bert/polish_roberta'
    tokenizer = SentencePieceBPETokenizer(os.path.join(base_dir, "vocab.json"),
                                          os.path.join(base_dir, "merges.txt"))
    tokenizer.enable_padding(pad_token="<pad>", pad_id=1, max_length=128)
    getattr(tokenizer,
            "_tokenizer").post_processor = RobertaProcessing(sep=("</s>", 2),
                                                             cls=("<s>", 0))

    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                labels,
                                                pad_token_label_id,
                                                mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        # tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir,
                                                    **tokenizer_args)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result, _ = evaluate(args,
                                 model,
                                 tokenizer,
                                 labels,
                                 pad_token_label_id,
                                 mode="dev",
                                 prefix=global_step)
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir,
                                                    **tokenizer_args)
        model = model_class.from_pretrained(args.output_dir)
        model.to(args.device)
        result, predictions = evaluate(args,
                                       model,
                                       tokenizer,
                                       labels,
                                       pad_token_label_id,
                                       mode="test")
        # Save results
        output_test_results_file = os.path.join(args.output_dir,
                                                "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))
        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir,
                                                    "test_predictions.txt")
        with open(output_test_predictions_file, "w") as writer:
            with open(os.path.join(args.data_dir, "test.txt"), "r") as f:
                example_id = 0
                for line in f:
                    if line.startswith(
                            "-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)
                        if not predictions[example_id]:
                            example_id += 1
                    elif predictions[example_id]:
                        output_line = line.split(
                        )[0] + " " + predictions[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logger.warning(
                            "Maximum sequence length exceeded: No prediction for '%s'.",
                            line.split()[0])

    return results