def main():
    """Train target and subtoken SentencePiece BPE tokenizers.

    Reads the input path (args.i), writes temporary BPE training files,
    trains one tokenizer per stream with preset-specific special tokens,
    and saves both models under the output directory (args.o, defaulting
    to the input file's directory).
    """
    args = cmd_args()
    # Default the output directory to the input file's directory.
    outdir = args.o if args.o else os.path.dirname(args.i)
    target_special_tokens, subtoken_special_tokens = get_special_tokens(
        args.preset)
    with tempfile.TemporaryDirectory() as tmp_dir:
        targets_file = os.path.join(tmp_dir, "labels.txt")
        subtokens_file = os.path.join(tmp_dir, "subtokens.txt")
        print(f"Creating training files for BPE")
        create_bpe_training_file(args.i, targets_file, subtokens_file)
        # NOTE(review): the original source was whitespace-mangled; it is
        # ambiguous whether only this print, or all of the training below,
        # was guarded by the preset check — confirm against upstream.
        if args.preset == Preset.variable:
            print("Variable preset")
        subtoken_tokenizer = SentencePieceBPETokenizer()
        target_tokenizer = SentencePieceBPETokenizer()
        print(f"Training subtoken tokenizer")
        subtoken_tokenizer.add_special_tokens(subtoken_special_tokens)
        print(f"Training target tokenizer")
        target_tokenizer.add_special_tokens(target_special_tokens)
        target_tokenizer.train(files=[targets_file],
                               vocab_size=args.target_vocab)
        subtoken_tokenizer.train(files=[subtokens_file],
                                 vocab_size=args.subtoken_vocab)
        target_tokenizer.save(outdir, "target.bpe")
        subtoken_tokenizer.save(outdir, "subtoken.bpe")
def __init__(self, args: Namespace):
    """Build target/subtoken BPE encoders and load the node-index mapping.

    Args:
        args: parsed CLI namespace providing target_vocab/target_merges and
            subtoken_vocab/subtoken_merges (tokenizer file paths) plus
            node_dict (path to a pickled name->index dict).
    """
    super().__init__()
    self.target_encoder = SentencePieceBPETokenizer(
        args.target_vocab, args.target_merges)
    self.subtoken_encoder = SentencePieceBPETokenizer(
        args.subtoken_vocab, args.subtoken_merges)
    # self.target_encoder.add_special_tokens(
    #     [self.EOS_TOKEN, self.SOS_TOKEN, self.PAD_TOKEN]
    # )
    # self.subtoken_encoder.add_special_tokens([self.EOS_TOKEN, self.PAD_TOKEN])
    # NOTE(review): pickle.load executes arbitrary code on untrusted input —
    # args.node_dict must come from a trusted source.
    with open(args.node_dict, "rb") as f:
        self.node_to_index = pickle.load(f)
    # Inverse mapping for decoding indices back to node names.
    self.index_to_node = {v: k for k, v in self.node_to_index.items()}
def test_train_from_iterator(self):
    """The tokenizer can be trained directly from an in-memory iterator."""
    corpus = ["A first sentence", "Another sentence", "And a last one"]
    tok = SentencePieceBPETokenizer()
    tok.train_from_iterator(corpus, show_progress=False)
    encoded = tok.encode("A sentence")
    assert encoded.tokens == ["▁A", "▁sentence"]
def load(vocab_file=None):
    """Load a pre-trained tokenizer whose family is encoded in the filename.

    The family is the filename prefix before the first underscore:
    "byte" -> ByteLevelBPETokenizer, "char" -> CharBPETokenizer,
    "bert" -> BertWordPieceTokenizer, "sent" -> SentencePieceBPETokenizer.
    The merges file is expected alongside the vocab file, named by replacing
    "vocab.json" with "merges.txt".

    Args:
        vocab_file: path to the "<type>_...vocab.json" file.

    Returns:
        The constructed tokenizer instance.

    Raises:
        Exception: if a required file is missing or the type is unknown.
    """
    # FIX: guard None explicitly — os.path.exists(None) raises TypeError.
    if vocab_file is None or not os.path.exists(vocab_file):
        raise Exception("{} does not exist".format(vocab_file))
    path, filename = os.path.split(vocab_file)
    ttype = filename.split("_")[0]
    merges_file = os.path.join(
        path, filename.replace("vocab.json", "merges.txt"))

    def _require_merges():
        # Every BPE variant except BERT WordPiece needs a merges file.
        if not os.path.exists(merges_file):
            raise Exception("{} does not exist".format(merges_file))

    if ttype == "byte":
        _require_merges()
        tokenizer = ByteLevelBPETokenizer(
            add_prefix_space=True,  # required
            lowercase=True,  # required
            unicode_normalizer=None,  # required
            vocab_file=vocab_file,
            merges_file=merges_file,
            dropout=None,
            continuing_subword_prefix=None,
            end_of_word_suffix=None)
    elif ttype == "char":
        _require_merges()
        tokenizer = CharBPETokenizer(
            unk_token=unk_token,  # required
            suffix=suffix_token,  # required
            lowercase=True,  # required
            unicode_normalizer=None,  # required
            vocab_file=vocab_file,
            merges_file=merges_file,
            dropout=None)
    elif ttype == "bert":
        tokenizer = BertWordPieceTokenizer(
            clean_text=True,  # required
            handle_chinese_chars=True,  # required
            strip_accents=True,  # required
            lowercase=True,  # required
            vocab_file=vocab_file,
            # add_special_tokens=True,
            unk_token=BUNK,
            sep_token=BSEP,
            cls_token=BCLS,
            wordpieces_prefix=BPRE)
    elif ttype == "sent":
        _require_merges()
        tokenizer = SentencePieceBPETokenizer(
            add_prefix_space=True,  # required
            unk_token=unk_token,
            replacement=rep_token,
            vocab_file=vocab_file,
            merges_file=merges_file,
            dropout=None)
    else:
        raise Exception("Not implement yet")
    return tokenizer
def train_tokenizer(self, paths: List[str], vocab_size: int,
                    to_save_dir: str, languages: Dict[str, int]):
    """Train a fresh SentencePiece BPE tokenizer on *paths* and persist it.

    init_properties(languages) runs before training; self.special_tokens is
    read afterwards when configuring the trainer.
    """
    bpe = SentencePieceBPETokenizer()
    self.tokenizer = bpe
    self.init_properties(languages)
    bpe.train(
        files=paths,
        vocab_size=vocab_size,
        min_frequency=5,
        special_tokens=self.special_tokens,
    )
    self.save(directory=to_save_dir)
def __init__(self, dataset_folder, tokenizer_method):
    """Remember the dataset location and build the requested tokenizer.

    tokenizer_method selects between a local SentencePiece BPE model and a
    pretrained BERT tokenizer; any other value leaves self.tokenizer unset.
    """
    self.dataset_folder = dataset_folder
    self.tokenizer_method = tokenizer_method
    if tokenizer_method == "sentencepiece":
        vocab = "./data/sentencepiece_tokenizer/vocab.json"
        merges = "./data/sentencepiece_tokenizer/merges.txt"
        self.tokenizer = SentencePieceBPETokenizer(vocab, merges)
    elif tokenizer_method == "bert":
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def main():
    """Train a BPE tokenizer on a directory of text files and smoke-test it.

    Selects byte-level, char or sentencepiece BPE via --bpe_tokenizer, trains
    on all --ext files under --train_dir, saves the model to --output_dir and
    reloads it through CamembertTokenizer as a sanity check.
    """
    parser = argparse.ArgumentParser(
        prog="train_mlm_camembert_thai.py",
        description="train mlm for Camembert with huggingface Trainer",
    )
    # required
    parser.add_argument("--bpe_tokenizer",
                        type=str,
                        default='sentencepiece',
                        help='Specify the name of BPE Tokenizer')
    parser.add_argument("--vocab_size", type=int, default=52000)
    parser.add_argument("--min_frequency", type=int, default=2)
    parser.add_argument("--train_dir", type=str)
    parser.add_argument("--output_dir", type=str)
    parser.add_argument("--ext", type=str, default='.txt')
    args = parser.parse_args()

    fnames = [str(x) for x in glob.glob(f"{args.train_dir}/*{args.ext}")]

    # FIX: the original used three independent `if`s and no fallback, so an
    # unrecognized --bpe_tokenizer crashed later with NameError. Use an
    # elif chain and fail fast with a clear error.
    if args.bpe_tokenizer == 'byte_level':
        tokenizer = ByteLevelBPETokenizer()
    elif args.bpe_tokenizer == 'char':
        tokenizer = CharBPETokenizer()
    elif args.bpe_tokenizer == 'sentencepiece':
        tokenizer = SentencePieceBPETokenizer()
    else:
        raise ValueError(f"Unknown BPE tokenizer: {args.bpe_tokenizer}")

    # Customize training
    tokenizer.train(files=fnames,
                    vocab_size=args.vocab_size,
                    min_frequency=args.min_frequency,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])
    # Save files to disk
    tokenizer.save_model(args.output_dir)

    # Smoke-test: reload through the transformers tokenizer wrapper.
    tokenizer = CamembertTokenizer.from_pretrained(args.output_dir)
    print(tokenizer.encode_plus('สวัสดีครับ hello world'))
def get_default_tokenizer():
    """Build the default English SentencePiece BPE tokenizer from bundled files."""
    from tokenizers import SentencePieceBPETokenizer

    vocab_file = path.join(VOCAB_PATH, 'en-vocab.json')
    merges_file = path.join(VOCAB_PATH, 'en-merges.txt')
    return SentencePieceBPETokenizer(vocab_file, merges_file,
                                     unk_token='[UNK]')
def __init__(self, tok_model_path: Optional[str] = None):
    """Optionally load a saved tokenizer model and its language table.

    When tok_model_path is None, only an empty language map is created;
    otherwise vocab.json/merges.txt and the pickled "langs" table are loaded
    from that directory and init_properties is invoked.
    """
    self.languages = {}
    if tok_model_path is None:
        return
    self.tokenizer = SentencePieceBPETokenizer(
        tok_model_path + "/vocab.json",
        tok_model_path + "/merges.txt",
    )
    langs_path = os.path.join(tok_model_path, "langs")
    with open(langs_path, "rb") as fp:
        self.languages: Dict[str, int] = pickle.load(fp)
    self.init_properties(self.languages)
def __init__(self, max_meta_len: int, max_body_len: int,
             ignore_meta_prob: float):
    """Initialize the base dataset with the module-level BPE tokenizer files."""
    backend = SentencePieceBPETokenizer(vocab_file=str(_VOCAB),
                                        merges_file=str(_MERGES))
    super().__init__(
        tokenizer=backend,
        max_meta_len=max_meta_len,
        max_body_len=max_body_len,
        ignore_meta_prob=ignore_meta_prob,
        pad_token='<pad>',
    )
def __init__(self, path, max_tokens):
    """Load a SentencePiece BPE tokenizer and cache its special-token ids.

    Args:
        path: directory containing vocab.json and merges.txt.
        max_tokens: limit stored on the instance for later use.
    """
    self.logger = log.getLogger("Tokenizer")
    self.logger.info("loading tokenizer")
    # FIX: lazy %-style arguments — the message is only formatted when INFO
    # is enabled (the original eagerly concatenated strings).
    self.logger.info("path: %s", path)
    self.logger.info("max_tokens: %s", max_tokens)
    self.tokenizer = SentencePieceBPETokenizer(
        os.path.join(path, "vocab.json"), os.path.join(path, "merges.txt"))
    self.max_tokens = max_tokens
    # Pre-resolve ids of the special tokens used elsewhere.
    self.idx = {}
    for s in ['</s>', '<s>', '<pad>']:
        self.idx[s] = self.tokenizer.token_to_id(s)
def fit_on_text(self, text):
    """Extend the subword vocabulary with BPE pieces found in *text*."""
    if self.lower:
        text = text.lower()
    bpe = SentencePieceBPETokenizer(vocab, merges)
    for word in text.split():
        for piece in bpe.encode(word).tokens:
            if piece in self.word2idx:
                continue
            # Assign the next free index to the unseen subword.
            self.word2idx[piece] = self.idx
            self.idx2word[self.idx] = piece
            self.idx += 1
def train(corpus_list, vocab_size, output, output_name=None):
    """Train a SentencePiece BPE tokenizer on the listed corpus files.

    Args:
        corpus_list: path to a text file listing one corpus file per line.
        vocab_size: target vocabulary size.
        output: directory the model is saved into.
        output_name: optional model-name prefix passed to save_model.
    """
    print("create tokenizer...")
    tokenizer = SentencePieceBPETokenizer()
    print("load corpus list...")
    # FIX: close the listing file (the original leaked the handle).
    # Trailing [:-1] drops the empty entry after the final newline.
    with open(corpus_list) as f:
        corpus_paths = f.read().split('\n')[:-1]
    print("train tokenizer...")
    tokenizer.train(
        corpus_paths,
        vocab_size=vocab_size,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    print("save model...")
    tokenizer.save_model(output, output_name)
def configure(self):
    """Set up the SentencePiece BPE tokenizer and resolve sos/eos token ids.

    Reads vocabulary_file/merges_file/add_extra_symbols and the
    sos_symbol/eos_symbol entries from the adapter config.
    """
    # SentencePieceBPETokenizer is replaced by an UnsupportedPackage stub when
    # the `tokenizers` package is unavailable; surface a provider error then.
    if isinstance(SentencePieceBPETokenizer, UnsupportedPackage):
        SentencePieceBPETokenizer.raise_error(self.__provider__)
    self.tokenizer = SentencePieceBPETokenizer(
        str(self.get_value_from_config('vocabulary_file')),
        str(self.get_value_from_config('merges_file')))
    self.add_extra_symbols = self.get_value_from_config(
        'add_extra_symbols')
    # Map 'sos'/'eos' to the vocabulary ids of the configured symbols.
    self.idx = {}
    for s in ['sos', 'eos']:
        self.idx[s] = self.tokenizer.token_to_id(
            str(self.get_value_from_config(s + '_symbol')))
def configure(self):
    """Set up the SentencePiece BPE tokenizer and remember symbol strings.

    Unlike the encoding counterpart, this stores the sos/eos/pad SYMBOL
    STRINGS (not ids) for later removal from decoded output.
    """
    # SentencePieceBPETokenizer is replaced by an UnsupportedPackage stub when
    # the `tokenizers` package is unavailable; surface a provider error then.
    if isinstance(SentencePieceBPETokenizer, UnsupportedPackage):
        SentencePieceBPETokenizer.raise_error(self.__provider__)
    self.tokenizer = SentencePieceBPETokenizer(
        str(self.get_value_from_config('vocabulary_file')),
        str(self.get_value_from_config('merges_file')))
    self.remove_extra_symbols = self.get_value_from_config(
        'remove_extra_symbols')
    self.idx = {}
    for s in ['sos', 'eos', 'pad']:
        self.idx[s] = str(self.get_value_from_config(s + '_symbol'))
    self.output_name = self.get_value_from_config('output_name')
    # Lazily validated on first use.
    self.output_checked = False
def __init__(self, path, max_tokens):
    """Load a SentencePiece BPE tokenizer from *path* (a pathlib.Path).

    Caches the ids of the special tokens used elsewhere in self.idx.
    """
    self.logger = log.getLogger("Tokenizer")
    self.logger.info("loading tokenizer")
    self.logger.info(f"path: {path}")
    self.logger.info(f"max_tokens: {max_tokens}")
    vocab_file = str(path / "vocab.json")
    merges_file = str(path / "merges.txt")
    self.tokenizer = SentencePieceBPETokenizer(vocab_file, merges_file)
    self.max_tokens = max_tokens
    self.idx = {
        tok: self.tokenizer.token_to_id(tok)
        for tok in ('</s>', '<s>', '<pad>')
    }
def __init__(self, tok_type, unk_token, sep_token, cls_token, pad_token,
             mask_token):
    """Create the tokenizer backend selected by *tok_type*.

    Args:
        tok_type: one of 'bpe', 'wordpiece', 'sentencepiece'.
        unk_token/sep_token/cls_token/pad_token/mask_token: special-token
            strings forwarded to the wordpiece (and unk to sentencepiece)
            backends.

    Raises:
        ValueError: on an unrecognized tok_type. (FIX: the original fell
        through silently, leaving self.tokenizer unset and deferring the
        failure to a confusing AttributeError later.)
    """
    self.tok_type = tok_type
    if self.tok_type == 'bpe':
        self.tokenizer = ByteLevelBPETokenizer()
    elif self.tok_type == 'wordpiece':
        self.tokenizer = BertWordPieceTokenizer(unk_token=unk_token,
                                                sep_token=sep_token,
                                                cls_token=cls_token,
                                                pad_token=pad_token,
                                                mask_token=mask_token)
    elif self.tok_type == 'sentencepiece':
        self.tokenizer = SentencePieceBPETokenizer(unk_token=unk_token)
    else:
        raise ValueError(f"Unknown tok_type: {tok_type!r}")
def set_tokenizer(self):
    """Initialize self.tokenizer according to self.storage_method.

    "raw" deliberately keeps the tokenizer as None (callers depend on that
    for their exception handling); "bert"/"roberta" load pretrained
    HuggingFace tokenizers; "token" loads a local SentencePiece BPE model
    from self.tokenizer_path.

    Raises:
        ValueError: for an unrecognized storage method.
    """
    if self.storage_method == "raw":
        pass  # Essentially keep it None. Important for exceptions
    elif self.storage_method == "bert":
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    elif self.storage_method == "roberta":
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    elif self.storage_method == "token":
        # BUG FIX: os.path.join(base, "/vocab.json") discards `base` because
        # the second component is absolute — the original always looked for
        # /vocab.json at the filesystem root.
        self.tokenizer = SentencePieceBPETokenizer(
            os.path.join(self.tokenizer_path, "vocab.json"),
            os.path.join(self.tokenizer_path, "merges.txt"))
    else:
        raise ValueError("Unknown storage method encountered!")
def __init__(self, args):
    """Build an untrained tokenizer of the family selected by args.type.

    Supported types: "byte" (ByteLevelBPETokenizer), "char"
    (CharBPETokenizer), "bert" (BertWordPieceTokenizer) and "sent"
    (SentencePieceBPETokenizer); any other value raises. vocab_file and
    merges_file are passed as None, so each tokenizer starts empty and
    must be trained before use. The special-token arguments
    (unk_token/suffix_token/rep_token/BUNK/BSEP/BCLS/BPRE) come from
    module level.
    """
    self.args = args
    if self.args.type == "byte":
        self.tokenizer = ByteLevelBPETokenizer(
            add_prefix_space=True,  # required
            lowercase=True,  # required
            unicode_normalizer=None,  # required
            vocab_file=None,
            merges_file=None,
            dropout=None,
            continuing_subword_prefix=None,
            end_of_word_suffix=None)
    elif self.args.type == "char":
        self.tokenizer = CharBPETokenizer(
            unk_token=unk_token,  # required
            suffix=suffix_token,  # required
            lowercase=True,  # required
            unicode_normalizer=None,  # required
            vocab_file=None,
            merges_file=None,
            dropout=None)
    elif self.args.type == "bert":
        self.tokenizer = BertWordPieceTokenizer(
            clean_text=True,  # required
            handle_chinese_chars=True,  # required
            strip_accents=True,  # required
            lowercase=True,  # required
            vocab_file=None,
            # add_special_tokens=True,
            unk_token=BUNK,
            sep_token=BSEP,
            cls_token=BCLS,
            wordpieces_prefix=BPRE)
    elif self.args.type == "sent":
        self.tokenizer = SentencePieceBPETokenizer(
            add_prefix_space=True,  # required
            unk_token=unk_token,
            replacement=rep_token,
            vocab_file=None,
            merges_file=None,
            dropout=None)
    else:
        raise Exception("Not implement yet")
    pass
def main():
    """Train a SentencePiece BPE vocabulary on the input and save the model."""
    args = cmd_args()
    # Output directory defaults to the input file's directory.
    outdir = args.o if args.o else os.path.dirname(args.i)
    print(
        f"Training SentencePiece to create a vocabulary of size {args.vocab_size}"
    )
    with tempfile.TemporaryDirectory() as workdir:
        corpus_path = os.path.join(workdir, "train.txt")
        create_bpe_training_file(args.i, corpus_path)
        bpe = SentencePieceBPETokenizer()
        bpe.train(files=[corpus_path], vocab_size=args.vocab_size)
        bpe.save(outdir, args.n)
def load_tokenizer(path,
                   enable_truncation=True,
                   enable_padding=True,
                   max_length=512):
    """Load a SentencePiece BPE tokenizer with BERT-style post-processing.

    Wraps output as <s> ... </s> and optionally enables truncation and
    <pad>-padding to max_length.
    """
    vocab_file = os.path.join(path, "vocab.json")
    merges_file = os.path.join(path, "merges.txt")
    tokenizer = SentencePieceBPETokenizer(vocab_file, merges_file)
    eos = ("</s>", tokenizer.token_to_id("</s>"))
    bos = ("<s>", tokenizer.token_to_id("<s>"))
    tokenizer._tokenizer.post_processor = BertProcessing(eos, bos)
    if enable_truncation:
        tokenizer.enable_truncation(max_length=max_length)
    if enable_padding:
        tokenizer.enable_padding(pad_token="<pad>",
                                 pad_id=tokenizer.token_to_id("<pad>"))
    return tokenizer
def load_tokenizer(langpair: str) -> SentencePieceBPETokenizer:
    """Load the WMT14 SentencePiece BPE tokenizer for a language pair.

    All English<->German spellings are collapsed onto the single "deen"
    model before the file names are resolved.
    """
    if langpair in ["en-de", "de-en", "ende", "deen", "ENDE", "EN-DE"]:
        langpair = "deen"
    tokenizer_dir = Path(__file__).parent.parent / "src" / "tokenizer"
    stem = f"sentencepiece_bpe_wmt14_{langpair}.tokenizer"
    vocab_filepath = tokenizer_dir / f"{stem}-vocab.json"
    merges_filepath = tokenizer_dir / f"{stem}-merges.txt"
    return SentencePieceBPETokenizer(
        vocab_file=str(vocab_filepath),
        merges_file=str(merges_filepath),
    )
def build_bpe(vocab_size=10000):
    """Clean the local Finnish corpus, train a SentencePiece BPE model, save it.

    Copies every .txt file from the hard-coded corpus directory into ./data
    (cleaning line by line with cleanup()), trains the tokenizer on the
    copies, prints a sample encoding and writes ./bpe-fi.tokenizer.json.
    """
    tokenizer = SentencePieceBPETokenizer()
    mypath = "../../Desktop/cood/python/machine-learning/old-school/markov-lstm-killer/data/fi"
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    print("ONL", onlyfiles)
    paths = [mypath + "/" + f for f in onlyfiles]

    # Copy the corpus locally, cleaning each line on the way.
    txts = []
    for path, fname in zip(paths, onlyfiles):
        if path.endswith(".txt"):  # FIX: endswith over manual slicing
            localpath = "data/" + fname
            txts.append(localpath)
            # FIX: the input file was never closed; both handles are now
            # managed by context managers.
            with open(path) as infile, open(localpath, "w") as outfile:
                for line in infile:
                    outfile.write(cleanup(line) + "\n")

    tokenizer.train(txts, vocab_size=vocab_size)

    # Sample sentence to eyeball the segmentation.
    s = "Det politiska arbetet har redan börjat på olika sätt, med resor, besök, möten, politikutveckling, motionsskrivande och mycket annat. Jag har sett att ni redan har varit aktiva under ett antal veckor, och jag kan försäkra er att det även gäller talmanspresidiet. Nu är det dags att med tillförsikt påbörja ett nytt riksdagsår. Jag hoppas att ni alla ser fram emot det lika mycket som jag gör."
    encoded = tokenizer.encode(s)
    print(encoded.ids)
    print(encoded.tokens)

    # And finally save it somewhere
    tokenizer.save("./bpe-fi.tokenizer.json")
def train_kenlm_language_model(input_data_paths, output_model_dir):
    """Train a 3-gram KenLM model over BPE-tokenized input files.

    Trains a SentencePiece BPE tokenizer on the inputs, tokenizes each file
    to a temp path, pipes the concatenation through the (interactively
    provided) lmplz binary, and cleans up the temp files.

    Returns:
        Path to the output model directory.
    """
    output_model_dir = Path(output_model_dir)
    output_model_dir.mkdir(exist_ok=True, parents=True)
    output_model_path = output_model_dir / 'kenlm_model.arpa'
    with log_action('Training tokenizer'):
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train([str(path) for path in input_data_paths],
                        vocab_size=20000)
        tokenizer.save(str(output_model_dir), 'spm_tokenizer')
    with log_action('Tokenizing'):
        tokenized_data_paths = get_temp_filepaths(len(input_data_paths))
        for tokenized_data_path, input_data_path in zip(
                tokenized_data_paths, input_data_paths):
            encodings = tokenizer.encode_batch(read_lines(input_data_path))
            write_lines(
                [' '.join(encoding.tokens) for encoding in encodings],
                tokenized_data_path)
    with log_action('Training language model'):
        kenlm_path = input(
            'Please provide the path to the lmplz script (install at https://github.com/kpu/kenlm): '
        )
        command = (
            f'cat {" ".join([str(path) for path in tokenized_data_paths])} | {kenlm_path} -o 3 > {output_model_path}'
        )
        run_command(command, mute=False)
    # FIX: side-effect list comprehension replaced by a plain loop.
    for path in tokenized_data_paths:
        path.unlink()
    return output_model_dir
def __init__(self,
             vocab_file,
             merges_file,
             unk_token="<unk>",
             bos_token="<s>",
             eos_token="</s>",
             pad_token="<pad>",
             add_prefix_space=False,
             **kwargs):
    """Wrap a SentencePiece BPE backend with the standard special tokens.

    Extra keyword arguments are forwarded to the base class unchanged.
    """
    backend = SentencePieceBPETokenizer(
        vocab_file=vocab_file,
        merges_file=merges_file,
        add_prefix_space=add_prefix_space,
    )
    super().__init__(
        backend,
        bos_token=bos_token,
        eos_token=eos_token,
        unk_token=unk_token,
        pad_token=pad_token,
        **kwargs,
    )
def __init__(self, fname, tokenizer, dat_fname):
    """Load (or build and cache) a paired column/text dataset from Excel.

    Args:
        fname: source .xlsx path. Positional columns are read as: 0 and 1 =
            column names, 2 and 3 = texts, 4 = integer class label —
            assumed from the indexing below; TODO confirm against the data.
        tokenizer: project tokenizer exposing text_to_sequence().
        dat_fname: pickle cache path; loaded directly when it exists.
    """
    if os.path.exists(dat_fname):
        print('loading dataset:', dat_fname)
        # NOTE(review): pickle.load is unsafe on untrusted cache files.
        self.data = pickle.load(open(dat_fname, 'rb'))
    else:
        # Subword tokenizer built from module-level vocab/merges paths.
        tokenizer1 = SentencePieceBPETokenizer(vocab, merges)
        reader = pd.read_excel(fname)
        all_data = []
        for i in range(reader.shape[0]):
            text_raw1 = []
            text_raw2 = []
            # BPE-encode the first column name as a whole string.
            column_name1 = tokenizer1.encode(
                reader.iloc[i][0].lower().strip()).tokens
            # Side-effect comprehension: extends text_raw1 with the BPE
            # tokens of each whitespace-separated word of column 2's text.
            [text_raw1.extend(tokenizer1.encode(x).tokens)
             for x in reader.iloc[i][2].lower().strip().split(' ')]
            column_name2 = tokenizer1.encode(
                reader.iloc[i][1].lower().strip()).tokens
            [text_raw2.extend(tokenizer1.encode(x).tokens)
             for x in reader.iloc[i][3].lower().strip().split(' ')]
            class_n = reader.iloc[i][4]
            # Convert token sequences to integer index sequences.
            text_raw_indices1 = tokenizer.text_to_sequence(text_raw1)
            aspect_indices1 = tokenizer.text_to_sequence(column_name1)
            text_raw_indices2 = tokenizer.text_to_sequence(text_raw2)
            aspect_indices2 = tokenizer.text_to_sequence(column_name2)
            data = {
                'text_raw_indices1': text_raw_indices1,
                'aspect_indices1': aspect_indices1,
                'text_raw_indices2': text_raw_indices2,
                'aspect_indices2': aspect_indices2,
                'class_n': int(class_n),
            }
            all_data.append(data)
        self.data = all_data
        # Cache the processed dataset for subsequent runs.
        pickle.dump(self.data, open(dat_fname, 'wb'))
        print("Finished write data file")
STORAGE_BUCKET = "gs://sbt0" # for prefix in prefixes: # input_dir_gs = os.path.join( # STORAGE_BUCKET, # "data/corpus/%s_lower/zhwiki-latest-pages-articles_%s_lower.txt" % (prefix, prefix) # ) # input_dir_local = "./zhwiki-latest-pages-articles_%s_lower.txt" % prefix # tf.gfile.Copy(input_dir_gs, input_dir_local, overwrite=True) for vocab_size in vocab_sizes: for prefix in prefixes: try: tokenizer_name = prefix + "_" + str(vocab_size) tokenizer = SentencePieceBPETokenizer() tokenizer.train( [ "./zhwiki-latest-pages-articles_%s_lower.txt" % prefix # "./zhwiki-latest-pages-articles_lower.txt" ], vocab_size=vocab_size, show_progress=True, min_frequency=1, special_tokens=[ "<unk>", "[SEP]", "[CLS]", "[PAD]", "[MASK]" ]) tokenizer.save("data_proc/tokenizers/sentencepiece", tokenizer_name)
data = load_both(args.reference, args.nbest) processed_ids = set() if args.c is not None: with jsonlines.open(args.c) as reader: for obj in reader: processed_ids.add(obj['id']) tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) model = AutoModelForMaskedLM.from_pretrained(model_name) if args.opi: from tokenizers import SentencePieceBPETokenizer from tokenizers.processors import RobertaProcessing tokenizer = SentencePieceBPETokenizer(f"{args.model}/vocab.json", f"{args.model}/merges.txt") getattr(tokenizer, "_tokenizer").post_processor = RobertaProcessing(sep=("</s>", 2), cls=("<s>", 0)) tokenizer.mask_token_id = model.roberta.embeddings.word_embeddings.weight.shape[0] - 1 # last is mask? model.eval() if device == 'cuda': if args.half: model.half() model.to(device) start = time.time() count = 0 with jsonlines.open(args.output, mode='w', flush=True) as writer: for id, utt in tqdm.tqdm(data.items(), desc="Texts"): if id in processed_ids: continue
"""Train a SentencePiece BPE tokenizer on a single corpus file (CLI script)."""
import argparse

from tokenizers import SentencePieceBPETokenizer
from tokenizers.trainers import BpeTrainer

parser = argparse.ArgumentParser()
parser.add_argument("--corpus_file", type=str)
parser.add_argument("--vocab_size", type=int, default=32000)
parser.add_argument("--limit_alphabet", type=int, default=6000)
args = parser.parse_args()

# BUG FIX: clean_text / handle_chinese_chars / strip_accents / lowercase /
# wordpieces_prefix are BertWordPieceTokenizer options — the
# SentencePieceBPETokenizer constructor rejects them with a TypeError.
# Construct with its own defaults instead.
tokenizer = SentencePieceBPETokenizer(vocab_file=None)

tokenizer.train(files=[args.corpus_file],
                limit_alphabet=args.limit_alphabet,
                vocab_size=args.vocab_size)

tokenizer.save("./",
               "ch-{}-wpm-{}".format(args.limit_alphabet, args.vocab_size))
def main():
    """CLI entry point for CoNLL-2003-style NER training/evaluation/prediction.

    Parses arguments, configures (distributed) devices, logging and seeding,
    loads the model config and a hard-coded Polish RoBERTa SentencePiece BPE
    tokenizer, then runs any of --do_train / --do_eval / --do_predict,
    writing checkpoints, eval_results.txt, test_results.txt and
    test_predictions.txt under --output_dir.

    Returns:
        dict of evaluation metrics (empty when --do_eval is not set).
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help=
        "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--keep_accents",
                        action="store_const",
                        const=True,
                        help="Set this flag if model is trained with accents.")
    parser.add_argument(
        "--strip_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained without accents.")
    parser.add_argument("--use_fast",
                        action="store_const",
                        const=True,
                        help="Set this flag to use fast tokenization.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps",
                        type=int,
                        default=500,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=500,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()
    print(args.model_name_or_path)

    # Refuse to clobber a non-empty output directory unless explicitly allowed.
    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of
        # sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real
    # label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        id2label={str(i): label for i, label in enumerate(labels)},
        label2id={label: i for i, label in enumerate(labels)},
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer_args = {
        k: v
        for k, v in vars(args).items()
        if v is not None and k in TOKENIZER_ARGS
    }
    logger.info("Tokenizer arguments: %s", tokenizer_args)
    # NOTE(review): the tokenizer path is hard-coded to a cluster-local
    # polish_roberta checkout instead of being derived from --tokenizer_name.
    base_dir = r'/net/people/plgpgajdzica/scratch/ner/data/embeddings/bert/polish_roberta'
    tokenizer = SentencePieceBPETokenizer(os.path.join(base_dir, "vocab.json"),
                                          os.path.join(base_dir, "merges.txt"))
    tokenizer.enable_padding(pad_token="<pad>", pad_id=1, max_length=128)
    getattr(tokenizer, "_tokenizer").post_processor = RobertaProcessing(
        sep=("</s>", 2), cls=("<s>", 0))
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                labels,
                                                pad_token_label_id,
                                                mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can
    # reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using
        # `save_pretrained()`. They can then be reloaded using
        # `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        # tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the
        # trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir,
                                                    **tokenizer_args)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # Checkpoint dirs are named ...-<step>; use the step as a prefix.
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result, _ = evaluate(args,
                                 model,
                                 tokenizer,
                                 labels,
                                 pad_token_label_id,
                                 mode="dev",
                                 prefix=global_step)
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(args.output_dir,
                                                    **tokenizer_args)
        model = model_class.from_pretrained(args.output_dir)
        model.to(args.device)
        result, predictions = evaluate(args,
                                       model,
                                       tokenizer,
                                       labels,
                                       pad_token_label_id,
                                       mode="test")
        # Save results
        output_test_results_file = os.path.join(args.output_dir,
                                                "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))
        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir,
                                                    "test_predictions.txt")
        with open(output_test_predictions_file, "w") as writer:
            with open(os.path.join(args.data_dir, "test.txt"), "r") as f:
                example_id = 0
                for line in f:
                    if line.startswith(
                            "-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)
                        if not predictions[example_id]:
                            example_id += 1
                    elif predictions[example_id]:
                        output_line = line.split(
                        )[0] + " " + predictions[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logger.warning(
                            "Maximum sequence length exceeded: No prediction for '%s'.",
                            line.split()[0])

    return results