def prepare_tokenizer(args):
    config = BertConfig.from_json_file(args.config_file)
    tokenizer = BertTokenizerFast(
        args.vocab_file, model_max_length=config.max_position_embeddings)
    return tokenizer
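# Usage sketch (not from the original source): prepare_tokenizer expects an
# argparse-style namespace; "config.json" and "vocab.txt" are hypothetical paths.
def _demo_prepare_tokenizer():
    from argparse import Namespace
    args = Namespace(config_file="config.json", vocab_file="vocab.txt")
    tok = prepare_tokenizer(args)
    # model_max_length now mirrors max_position_embeddings from the config,
    # so over-long inputs are truncated at the model's position budget.
    print(tok.model_max_length)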
def load_pretrained_bert_tokenizer(vocab_file=None):
    """Create a tokenizer from a vocab file, using the Transformers library."""
    import os
    from transformers import BertTokenizerFast

    if vocab_file is None:
        vocab_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "bert-base-uncased-vocab.txt"
        )
    tokenizer = BertTokenizerFast(
        vocab_file=vocab_file,
        # the following arguments all match the defaults; listed for clarity
        clean_text=True,
        tokenize_chinese_chars=True,
        do_lower_case=True,
        strip_accents=True,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
    )
    return tokenizer
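# Usage sketch (not from the original source): with no argument the loader
# falls back to "bert-base-uncased-vocab.txt" next to this module.
def _demo_load_pretrained_bert_tokenizer():
    tok = load_pretrained_bert_tokenizer()
    enc = tok("Hello, world!")
    # WordPiece ids wrapped in [CLS] ... [SEP], per the special tokens above.
    print(enc["input_ids"])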
def init_tgt(params):
    """Initialize the parameters of the target model."""
    prob = None
    if params.prob:
        print(' | load word translation probs!')
        prob = torch.load(params.prob)

    print(f'| load English pre-trained model: {params.src_model}')
    config = AutoConfig.from_pretrained(params.src_model,
                                        cache_dir=params.cache_dir)
    model = AutoModelForMaskedLM.from_pretrained(
        params.src_model,
        from_tf=bool(".ckpt" in params.src_model),
        config=config,
        cache_dir=params.cache_dir,
    )

    if 'roberta' in params.src_model:
        assert params.src_merge, "merge file should be provided!"
        src_tokenizer = RobertaTokenizer(params.src_vocab, params.src_merge)
    else:
        # note that we do not lowercase here
        src_tokenizer = AutoTokenizer.from_pretrained(
            params.src_model,
            cache_dir='/home/georgios.vernikos/workspace/LMMT/MonoEgo/cache',
            use_fast=True)

    # get the English word embeddings and bias
    src_embs = model.base_model.embeddings.word_embeddings.weight.detach().clone()
    src_bias = model.cls.predictions.bias.detach().clone()

    # initialize the target tokenizer; we always use a BERT WordPiece
    # tokenizer for the target language
    tgt_tokenizer = BertTokenizerFast(vocab_file=params.tgt_vocab,
                                      do_lower_case=True,
                                      strip_accents=False)
    tgt_embs, tgt_bias = guess(src_embs, src_bias, tgt_tokenizer,
                               src_tokenizer, prob=prob)

    # checksum for debugging purposes
    print(' checksum src | embeddings {:.5f} - bias {:.5f}'.format(
        src_embs.norm().item(), src_bias.norm().item()))

    model.base_model.embeddings.word_embeddings.weight.data = tgt_embs
    model.cls.predictions.bias.data = tgt_bias
    model.tie_weights()

    print(' checksum tgt | embeddings {:.5f} - bias {:.5f}'.format(
        model.base_model.embeddings.word_embeddings.weight.norm().item(),
        model.cls.predictions.bias.norm().item()))

    # save the model
    model_to_save = (model.module if hasattr(model, "module") else model
                     )  # take care of distributed/parallel training
    model_to_save.save_pretrained(params.tgt_model)
def get_bert_tokenizer(bert_model_type):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        do_lower_case = '-cased' not in bert_model_type  # default is True
        return BertTokenizerFast(vocab_file=BERT_VOCAB_FILE[bert_model_type],
                                 do_lower_case=do_lower_case)
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        return RobertaTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            merges_file=BERT_MERGE_FILE[bert_model_type],
            add_prefix_space=True)
    elif bert_model_type in ['xlnet-base-cased']:
        do_lower_case = '-uncased' in bert_model_type  # default is False
        return XLNetTokenizer(vocab_file=BERT_VOCAB_FILE[bert_model_type],
                              do_lower_case=do_lower_case)
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        return AlbertTokenizer(vocab_file=BERT_VOCAB_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        tokenizer = GPT2TokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            merges_file=BERT_MERGE_FILE[bert_model_type],
            add_prefix_space=True)
        # GPT-2 has no pad token; reuse EOS for padding, see
        # https://github.com/huggingface/transformers/issues/3859
        tokenizer.pad_token = tokenizer.eos_token
        return tokenizer
    elif bert_model_type in ['transfo-xl']:
        return TransfoXLTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        do_lower_case = '-cased' not in bert_model_type  # default is True
        return DistilBertTokenizerFast(
            vocab_file=BERT_VOCAB_FILE[bert_model_type],
            do_lower_case=do_lower_case)
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')
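# Usage sketch (not from the original source): BERT_VOCAB_FILE and
# BERT_MERGE_FILE are assumed to be module-level dicts mapping model names to
# local files, as the function above implies.
def _demo_get_bert_tokenizer():
    tok = get_bert_tokenizer('bert-base-uncased')  # uncased -> do_lower_case=True
    print(tok.tokenize("Casing Is Dropped For Uncased Models"))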
def get_tokenizer(vocab_file):
    tokenizer = BertTokenizerFast(
        vocab_file=vocab_file,
        do_basic_tokenize=True
    )
    special_tokens_dict = {'additional_special_tokens': ["<end>", "<begin>"]}
    tokenizer.add_special_tokens(special_tokens_dict)
    return tokenizer
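# Usage sketch (not from the original source; "vocab.txt" is a hypothetical
# path): the two added special tokens enlarge the vocabulary, so any model
# paired with this tokenizer must resize its embedding matrix.
def _demo_get_tokenizer():
    tok = get_tokenizer("vocab.txt")
    # "<begin>"/"<end>" now map to single ids instead of being split:
    print(tok.convert_tokens_to_ids(["<begin>", "<end>"]))
    # For any HF model `m` used with this tokenizer:
    # m.resize_token_embeddings(len(tok))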
def __init__(self, output_fname, vocab_file, max_seq_length,
             blanks_separate_docs, do_lower_case):
    self._blanks_separate_docs = blanks_separate_docs
    tokenizer = BertTokenizerFast(vocab_file=vocab_file,
                                  do_lower_case=do_lower_case)
    self._example_builder = ExampleBuilder(tokenizer, max_seq_length)
    self._writers = []
    self._wd = tf.io.TFRecordWriter(output_fname)
    self.n_written = 0
def preprocess_corpus_fixed_lines(ln, mapper, max_seq, lines_limit):
    """
    Merge/separate lines so that every line is made of approximately max_seq
    tokens. Create:
    - a new baseline corpus, lowercased.
    - a new cID corpus, made of cID-strings.
    """
    original_corpus_path = get_corpus(ln, dense=False, cid=False)
    dense_corpus_path = get_corpus(ln, dense=True, cid=False)
    cID_dense_corpus_path = get_corpus(ln, dense=False, cid=True)

    with open(original_corpus_path, "r", encoding='utf-8') as original_corpus, \
            open(dense_corpus_path, "x", encoding='utf-8') as dense_corpus, \
            open(cID_dense_corpus_path, "x", encoding='utf-8') as cID_dense_corpus:
        lines_list = original_corpus.read().splitlines()
        tokenizer = BertTokenizerFast(Path(args.icebert_folder) /
                                      args.monolingual_tokenizers_root_path /
                                      (ln + '.txt'),
                                      do_lower_case=False,
                                      add_special_tokens=True)
        sentence_tokenizer = NLTKSegmenter()
        dense_line = []
        cID_dense_line = []
        line_length = 0
        line_index = 0
        number_of_dense_lines = 0
        tot_lines = len(lines_list)
        while number_of_dense_lines < lines_limit:
            # cycle back to the start instead of reading beyond the EOF
            if line_index == tot_lines:
                line_index = 0
            sentences = sentence_tokenizer.segment_string(
                lines_list[line_index], lowercase=args.lowercase_corpus)
            line_index += 1
            # we work at sentence level (not line level) to avoid cutting very long lines
            for sentence in sentences:
                cIDs = encode_cID(
                    fast_tokenize(sentence, ln, tokenizer, mark=True), mapper)
                line_length += len(cIDs)
                dense_line.append(sentence.strip())
                cID_dense_line.append(" ".join(cIDs).strip())
                # once we reach the maximum number of tokens, write out the
                # dense line and start building a new one
                if line_length > max_seq:
                    dense_corpus.write(" ".join(dense_line) + "\n")
                    cID_dense_corpus.write(" ".join(cID_dense_line) + "\n")
                    number_of_dense_lines += 1
                    dense_line = []
                    cID_dense_line = []
                    line_length = 0
    return
def test_export_custom_bert_model(self):
    from transformers import BertModel

    vocab = ["[UNK]", "[SEP]", "[CLS]", "[PAD]", "[MASK]", "some", "other",
             "words"]
    with NamedTemporaryFile(mode="w+t") as vocab_file:
        vocab_file.write("\n".join(vocab))
        vocab_file.flush()
        tokenizer = BertTokenizerFast(vocab_file.name)

        with TemporaryDirectory() as bert_save_dir:
            model = BertModel(BertConfig(vocab_size=len(vocab)))
            model.save_pretrained(bert_save_dir)
            self._test_export(bert_save_dir, "pt", 12, tokenizer)
def build_model(args):
    if args.pretrained_path == '':
        config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config)
        tokenizer = BertTokenizerFast(args.vocab)
        # NOTE: without this call, special tokens appearing inside a string
        # are not tokenized to a single id.
        tokenizer.sanitize_special_tokens()
        info = None
    else:
        config = GPT2Config.from_pretrained(args.pretrained_path)
        model, info = GPT2LMHeadModel.from_pretrained(args.pretrained_path,
                                                      config=config,
                                                      output_loading_info=True)
        tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_path)
    return model, tokenizer, info
def __init__(self, vocab_path, strip_accents, clean_text, lowercase,
             from_pretrained=False):
    common_params = {
        'strip_accents': strip_accents,
        'clean_text': clean_text,
        'lowercase': lowercase
    }
    if from_pretrained:
        self._tokenizer = BertTokenizerFast.from_pretrained(
            pretrained_model_name_or_path=vocab_path, **common_params)
    else:
        self._tokenizer = BertTokenizerFast(vocab_file=vocab_path,
                                            **common_params)
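# Usage sketch (not from the original source): the wrapper above switches
# between a local vocab file and a hub checkpoint via `from_pretrained`.
# `TokenizerWrapper` stands in for whatever the enclosing class is called:
#
#   local = TokenizerWrapper("vocab.txt", strip_accents=False,
#                            clean_text=True, lowercase=True)
#   hub = TokenizerWrapper("bert-base-uncased", strip_accents=False,
#                          clean_text=True, lowercase=True,
#                          from_pretrained=True)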
def main():
    params = parser.parse_args()
    params.lowercase = params.lowercase == 'True'
    print(params)
    model_name = 'bert-base-uncased' if params.lowercase else 'bert-base-cased'
    print(model_name)
    src_tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir='/home/georgios.vernikos/workspace/LMMT/MonoEgo/cache',
        use_fast=True)
    tgt_tokenizer = BertTokenizerFast(vocab_file=params.tgt_vocab,
                                      do_lower_case=params.lowercase,
                                      strip_accents=False)

    src_embs, src_subwords = get_subword_embeddings(src_tokenizer,
                                                    params.src_aligned_vec,
                                                    params.topn,
                                                    params.lowercase)
    tgt_embs, tgt_subwords = get_subword_embeddings(tgt_tokenizer,
                                                    params.tgt_aligned_vec,
                                                    params.topn,
                                                    params.lowercase)
    src_embs = renorm(src_embs, 1)
    tgt_embs = renorm(tgt_embs, 1)

    # initialize sparsemax
    sparsemax = Sparsemax(1)
    print(f'| # src subwords found: {len(src_subwords)}')
    print(f'| # tgt subwords found: {len(tgt_subwords)}')

    print('| compute translation probability')
    scores = tgt_embs @ src_embs.t()
    a = sparsemax(scores)  # (Vf, Ve)

    print('| generating translation table!')
    probs = {}
    for i, tt in tqdm(enumerate(tgt_subwords), total=len(tgt_subwords)):
        ix = torch.nonzero(a[i]).view(-1)
        px = a[i][ix].tolist()
        wx = [src_subwords[j] for j in ix.tolist()]
        probs[tt] = {w: p for w, p in zip(wx, px)}
    n_avg = np.mean([len(ss) for ss in probs.values()])
    print(f'| average # source / target: {n_avg:.2f}')
    print(f"| save translation probabilities: {params.save}")
    torch.save(probs, params.save)
def main():
    parser = argparse.ArgumentParser()
    # input data and model directories
    parser.add_argument('--config', type=str, required=True)
    parser.add_argument('--output', type=int, required=True)
    parser.add_argument('--thread', type=int, required=True)
    parser.add_argument('--language', type=str, required=True)
    args, _ = parser.parse_known_args()

    with open(args.config) as f:
        config = json.load(f)
    files_paths = config["paths"]
    record_params = config["records"]
    seed_input = config["seed"]
    language = args.language

    character_tokenizer = BertTokenizerFast(
        Path(files_paths["vocab_file_root"]) / language / "alphabet",
        do_lower_case=False)
    ocr_errors_generator = ErrorTable(character_tokenizer)
    with open(Path(files_paths["ocr_errors_root"]) / language / "ocr_errors.txt",
              encoding="utf-8") as f:
        ocr_errors_generator.load_table_from_file(f)
    dataset_generator = CorrectionDatasetGenerator(
        character_tokenizer, ocr_errors_generator,
        record_params["sequence_length"])

    output_dir = Path(files_paths["output_file_root"]) / language
    tf.io.gfile.makedirs(str(output_dir))
    writer = tf.io.TFRecordWriter(str(output_dir / f"tf_record_{args.output}"),
                                  options="GZIP")
    logging.basicConfig(level=logging.INFO)

    inst_idx = 0
    start_time = time.time()
    for repeat in range(record_params["dupe_factor"]):
        example_cache = []
        for inputs, outputs in dataset_generator.generate_dataset(
                files_paths["dataset_dir"], args.thread, seed_input):
            inst_idx += 1
            break
def get_tokenizer() -> BertTokenizerFast:
    """
    Returns the tokenizer for the model.

    Parameters:
        None

    Returns:
        tokenizer (BertTokenizerFast): loaded and configured tokenizer.
    """
    # Load the BERT layer from TF Hub
    print(f"\nTrying to load BERT layer from {BERT_LAYER_HUB_URL}\n")
    bert_layer = tf_hub.KerasLayer(BERT_LAYER_HUB_URL, trainable=False)
    print(f"\nLoaded BERT layer from {BERT_LAYER_HUB_URL}")

    # Get the vocab file shipped with the layer
    vocab_path = bert_layer.resolved_object.vocab_file.asset_path.numpy().decode("utf-8")

    # Create the tokenizer
    tokenizer = BertTokenizerFast(vocab_path)
    return tokenizer
def __init__(self, train=True):
    if train:
        path = ("/data/data_train.txt", "/data/pos_train.txt")
    else:
        path = ("/data/data_val.txt", "/data/pos_val.txt")
    self.tokenizer = BertTokenizerFast("wiki-vocab.txt")
    self.paragraphs = [[]]
    self.pos_labels = set([])
    valid = True
    with open(path[0], encoding="utf-8") as f_data:
        with open(path[1], encoding="utf-8") as f_pos:
            for d, p in tqdm(zip(f_data, f_pos), desc="load_data"):
                if len(d.strip()) == 0:
                    if len(self.paragraphs[-1]) > 0:
                        self.paragraphs.append([])
                    else:
                        valid = True
                elif valid:
                    _d, _p = d.strip().split(), p.strip().split()
                    if len(_d) != len(_p) or len(_p) > 256:
                        valid = False
                        self.paragraphs[-1] = []
                    else:
                        assert len(_d) == len(_p), f"{len(_d)} {len(_p)}"
                        self.paragraphs[-1].append((_d, _p))
                        self.pos_labels |= set(_p)
    print(len(self.paragraphs))
    if train:
        self.pos_labels_to_ids = {}
        for i, pos_label in enumerate(sorted(self.pos_labels)):
            self.pos_labels_to_ids[pos_label] = i + 1
    else:
        with open('./pretrain.wiki.dict') as f:
            self.pos_labels_to_ids = eval(f.read())
        i = len(self.pos_labels_to_ids)
        for _, pos_label in enumerate(sorted(self.pos_labels)):
            if pos_label not in self.pos_labels_to_ids:
                self.pos_labels_to_ids[pos_label] = i
                i += 1
def __init__(self, config):
    super().__init__(config)
    self.tokenizer = BertTokenizerFast("../Bert/assets/vocab.txt")
    self.num_labels = config.num_labels

    self.bert = BertModel(config)
    self.cls = BertOnlyMLMHead(config)
    # PyTorch equivalent of the original TF projection head:
    #   projected_emb = tf.layers.dense(output_layer, params["projection_size"])
    #   projected_emb = tf.keras.layers.LayerNormalization(axis=-1)(projected_emb)
    #   if is_training:
    #       projected_emb = tf.nn.dropout(projected_emb, rate=0.1)
    self.dense = nn.Linear(config.hidden_size, 128)
    self.LayerNorm = nn.LayerNorm(128)
    self.projected_emb = nn.Dropout(0.1)
def create_dense_files(original_corpus_path, dense_corpus_path,
                       cID_dense_corpus_path, ln, mapper, max_seq):
    with open(original_corpus_path, "r", encoding='utf-8') as original_corpus, \
            open(dense_corpus_path, "x", encoding='utf-8') as dense_corpus, \
            open(cID_dense_corpus_path, "x", encoding='utf-8') as cID_dense_corpus:
        lines_list = original_corpus.read().splitlines()
        tokenizer = BertTokenizerFast(
            Path(args.icebert_folder) / args.monolingual_tokenizers_root_path /
            (ln + '.txt'),
            do_lower_case=False,
            add_special_tokens=True)
        sentence_tokenizer = NLTKSegmenter()
        dense_line = []
        cID_dense_line = []
        line_length = 0
        marked_lines = 0
        for line in tqdm(lines_list):
            if not (line[:6] == "</doc>" or line[:4] == "<doc"):
                sentences = sentence_tokenizer.segment_string(
                    line, lowercase=args.lowercase_corpus)
                # we work at sentence level (not line level) to avoid cutting very long lines
                for sentence in sentences:
                    cIDs = encode_cID(
                        fast_tokenize(sentence, ln, tokenizer, mark=True),
                        mapper)
                    line_length += len(cIDs)
                    dense_line.append(sentence.strip())
                    cID_dense_line.append(" ".join(cIDs).strip())
                    # once we reach the maximum number of tokens, write out the
                    # dense line and start building a new one
                    if line_length > max_seq:
                        dense_corpus.write(" ".join(dense_line) + "\n")
                        cID_dense_corpus.write(" ".join(cID_dense_line) + "\n")
                        dense_line = []
                        cID_dense_line = []
                        line_length = 0
            else:
                marked_lines += 1
    return marked_lines
def test_realm_tokenizer(vocabfile):
    tokenizer = BertTokenizerFast(vocabfile)
    tens_tokenizer = bert.tokenization.FullTokenizer(vocab_file=vocabfile,
                                                     do_lower_case=True)
    dataset = NQ().get_train_data()
    print("\nTesting that the TensorFlow and PyTorch tokenizers agree on NQ")
    for text in tqdm(dataset):
        if (not tens_tokenizer.convert_tokens_to_ids(
                ['[CLS]'] + tens_tokenizer.tokenize(text['question']) +
                ['[SEP]']) == tokenizer(text['question'])['input_ids']):
            raise Exception(
                "The TensorFlow tokenizer and the PyTorch tokenizer produce "
                "different outputs: {} and {}".format(
                    tens_tokenizer.convert_tokens_to_ids(
                        ['[CLS]'] +
                        tens_tokenizer.tokenize(text['question']) +
                        ['[SEP]']),
                    tokenizer(text['question'])['input_ids']))
    print("Tokenizer is correctly imported")
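# Spot check (not from the original source): the same parity can be verified
# on a single string before running the full NQ sweep.
def _demo_tokenizer_parity(vocabfile):
    hf_tokenizer = BertTokenizerFast(vocabfile)
    tf_tokenizer = bert.tokenization.FullTokenizer(vocab_file=vocabfile,
                                                   do_lower_case=True)
    text = "where is the eiffel tower"
    tf_ids = tf_tokenizer.convert_tokens_to_ids(
        ['[CLS]'] + tf_tokenizer.tokenize(text) + ['[SEP]'])
    assert tf_ids == hf_tokenizer(text)['input_ids']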
def line_tokenizer_reader(args, file_queue: Queue, line_counter: Counter,
                          file_counter: Counter):
    blanks_separate_docs = args.blanks_separate_docs
    tokenizer = BertTokenizerFast(vocab_file=args.vocab_file,
                                  do_lower_case=args.do_lower_case,
                                  unk_token='[UNK]',
                                  sep_token='[SEP]',
                                  strip_accents=False,
                                  clean_text=True,
                                  tokenize_chinese_chars=False,
                                  wordpieces_prefix='##')
    example_builder = ExampleBuilder(tokenizer, args.max_seq_length)
    output_filename_template = os.path.join(
        args.output_dir, "pretrain_data-{:04d}.tfrecord.lz")
    sent_messages = 0
    logger = logging.getLogger()
    FOUR_HUNDRED_MB = 400e6  # maximum bytes per output shard
    writer = TFWriter(FOUR_HUNDRED_MB, output_filename_template, file_counter)

    def send(example):
        if not example:
            return 0
        while not shutdown_event.is_set():
            try:
                writer.write(example.SerializeToString())
                break
            except queue.Full:
                logger.warning('queue is full')
                time.sleep(0.01)
                continue
        return 1

    while not shutdown_event.is_set():
        try:
            input_file = file_queue.get(block=True, timeout=0.05)
        except EOFError:
            continue
        except queue.Empty:
            continue
        except ValueError:
            break
        try:
            lines_read = 0
            start = time.time()
            bytes_read = 0
            already_sent = sent_messages
            previous_reading = 0
            with tf.io.gfile.GFile(input_file) as f:
                for line in f:
                    bytes_read += len(line)
                    line = line.strip()
                    if line or blanks_separate_docs:
                        sent_messages += send(example_builder.add_line(line))
                    lines_read += 1
                    elapsed = time.time()
                    if lines_read % 100 == 0:
                        line_counter.increment(lines_read - previous_reading)
                        previous_reading = lines_read
                        logger.info('reading %.1f lines/sec',
                                    lines_read / (elapsed - start))
                        logger.info('read %s bytes',
                                    humanize.filesize.naturalsize(bytes_read))
                        logger.info('sending %.1f messages/sec',
                                    (sent_messages - already_sent) /
                                    (elapsed - start))
            sent_messages += send(example_builder.add_line(""))
        except Exception as exc:
            import traceback
            traceback.print_exc()
            logger.error('reading file %r %s', exc, input_file)
    writer.close()
    return sent_messages
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can
    # concurrently download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    print("Config before overwrite max_position_embeddings:", config)
    config.max_position_embeddings = 4096
    config.num_hidden_layers = 6
    config.num_attention_heads = 8
    config.hidden_size = 512
    config.intermediate_size = 2048
    print("Config after overwrite max_position_embeddings:", config)

    logging.info("Loading tokenizer")
    if model_args.tokenizer_name:
        tokenizer = BertTokenizerFast(model_args.tokenizer_name,
                                      clean_text=True,
                                      do_lower_case=False,
                                      strip_accents=True)
    else:
        raise ValueError("Specify tokenizer name")

    logging.info("Loading model")
    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    logging.info("Resizing embeddings")
    model.resize_token_embeddings(len(tokenizer))
    print(len(tokenizer.get_vocab()), len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")

    # Get datasets
    logging.info("Loading train dataset")
    train_dataset = get_dataset(data_args) if training_args.do_train else None
    logging.info("Loading eval dataset")
    eval_dataset = (get_dataset(
        data_args,
        evaluate=True,
    ) if training_args.do_eval else None)
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability,
        )

    # Initialize our Trainer
    logging.info("Initializing trainer")
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        logging.info("Training")
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path)
                      else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        eval_output = trainer.evaluate()
        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
def main():
    # initialize arguments
    args = set_args()
    # select which GPUs to use for training
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    args.cuda = not args.no_cuda

    if args.batch_size < 2048 and args.warmup_steps <= 4000:
        print('[Warning] The warmup steps may be not enough.\n'
              '(sz_b, warmup) = (2048, 4000) is the official setting.\n'
              'Using smaller batch w/o longer warmup may cause '
              'the warmup stage ends with only little data trained.')

    # create the logger
    logger = create_logger(args)
    # use the GPU only when requested and available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda:0' if args.cuda else 'cpu'
    args.device = device
    logger.info('using device:{}'.format(device))

    # initialize the tokenizer
    tokenizer = BertTokenizerFast(vocab_file=args.vocab_path,
                                  sep_token="[SEP]",
                                  pad_token="[PAD]",
                                  cls_token="[CLS]")
    args.sep_id = tokenizer.sep_token_id
    args.pad_id = tokenizer.pad_token_id
    args.cls_id = tokenizer.cls_token_id

    # create the model output directory
    if not os.path.exists(args.save_model_path):
        os.mkdir(args.save_model_path)

    # create the model
    if args.pretrained_model:
        # load a pretrained model
        model = GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:
        # initialize a model from the config
        model_config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config=model_config)
    model = model.to(device)
    logger.info('model config:\n{}'.format(model.config.to_json_string()))
    assert model.config.vocab_size == tokenizer.vocab_size

    # multi-GPU training
    if args.cuda and torch.cuda.device_count() > 1:
        model = DataParallel(model).cuda()
        logger.info("use GPU {} to train".format(args.device))

    # count model parameters
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    logger.info('number of model parameters: {}'.format(num_parameters))

    # log the argument settings
    logger.info("args:{}".format(args))

    # load the training and validation sets
    train_dataset, validate_dataset = load_dataset(logger, args)

    train(model, logger, train_dataset, validate_dataset, args)
    lowercase=False,
)
wp_tokenizer.train(
    files='/opt/ml/code/KBOBERT/KBOBERT_Data.txt',
    vocab_size=32000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    wordpieces_prefix="##")
wp_tokenizer.save_model('./')

tokenizer = BertTokenizerFast(
    vocab_file="/opt/ml/code/KBOBERT/vocab.txt",
    max_len=512,
    do_lower_case=False,
)
tokenizer.add_special_tokens({'mask_token': '[MASK]'})

# https://huggingface.co/transformers/model_doc/bert.html#bertconfig
config = BertConfig(vocab_size=32000,
                    hidden_size=256,
                    num_hidden_layers=6,
                    num_attention_heads=4,
                    intermediate_size=3072,
                    hidden_act="gelu",
                    hidden_dropout_prob=0.1,
                    attention_probs_dropout_prob=0.1,
def __init__(self, train=True):
    if train:
        path = "/data/KorQuAD_v1.0_train.json"
        db_name = "korquad_train.qas"
    else:
        path = "/data/KorQuAD_v1.0_dev.json"
        db_name = "korquad_dev.qas"
    self.tokenizer = BertTokenizerFast("wiki-vocab.txt")
    data = json.load(open(path, encoding="utf-8"))["data"]
    self.qas = []
    if not os.path.exists(db_name):
        with open(db_name, "wb") as f:
            self.mecab = Mecab()
            ignored_cnt = 0
            for paragraphs in tqdm(data):
                paragraphs = paragraphs["paragraphs"]
                for paragraph in paragraphs:
                    _context = paragraph["context"]
                    for qa in paragraph["qas"]:
                        question = qa["question"]
                        answer = qa["answers"][0]["text"]
                        (
                            input_ids,
                            token_type_ids,
                            start_token_pos,
                            end_token_pos,
                        ) = self.extract_features(
                            _context,
                            question,
                            answer,
                            qa["answers"][0]["answer_start"],
                        )
                        # examples longer than 512 tokens are skipped when
                        # building the training set, but kept for evaluation
                        if len(input_ids) > 512:
                            if not train:
                                pickle.dump(
                                    (input_ids, token_type_ids,
                                     start_token_pos, end_token_pos), f)
                        else:
                            pickle.dump(
                                (input_ids, token_type_ids,
                                 start_token_pos, end_token_pos), f)
    with open(db_name, "rb") as f:
        while True:
            try:
                data = pickle.load(f)
                self.qas.append(data)
            except EOFError:
                break
    print(len(self.qas))
def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    # **YD** add BertTokenizerFast to suit the CoNLL-2003 NER task; the
    # pipeline is similar to
    # https://github.com/huggingface/transformers/tree/master/examples/token-classification

    # 1. obtain tokenizer and data_collator
    tokenizer = BertTokenizerFast(args.dict)
    data_collator = YD_DataCollatorForELClassification(
        tokenizer, max_length=args.max_pred_length, padding=True)

    # 2. process datasets (tokenization of NER data)
    # **YD** args added in option.py for the fine-tuning task
    data_files = {}
    if args.train_file is not None:
        data_files["train"] = args.train_file
    if args.validation_file is not None:
        data_files["validation"] = args.validation_file
    if args.test_file is not None:
        data_files["test"] = args.test_file
    extension = args.extension_file
    dataset = datasets.load_dataset(extension, data_files=data_files)

    # 3. set up num_labels
    if 'train' in dataset:
        column_names = dataset["train"].column_names
        features = dataset["train"].features
    elif 'validation' in dataset:
        column_names = dataset["validation"].column_names
        features = dataset["validation"].features
    elif 'test' in dataset:
        column_names = dataset["test"].column_names
        features = dataset["test"].features
    else:
        raise ValueError('dataset must contain "train"/"validation"/"test"')

    text_column_name = 'tokens'
    label_column_name = 'ner_tags'
    entity_column_name = 'entity_names'
    assert text_column_name in column_names
    assert label_column_name in column_names
    assert entity_column_name in column_names

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        if 'train' in dataset:
            label_list = get_label_list(dataset["train"][label_column_name])
        elif 'validation' in dataset:
            label_list = get_label_list(
                dataset["validation"][label_column_name])
        elif 'test' in dataset:
            label_list = get_label_list(dataset["test"][label_column_name])
        else:
            raise ValueError(
                'dataset must contain "train"/"validation"/"test"')
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # **YD** prepare ent_name_id from deep_ed to map an entity name or entity
    # wikiid to a thid (entity-embedding lookup index)
    ent_name_id = EntNameID(args)

    # 4. tokenization
    # Tokenize all texts and align the **NER labels and entity labels** with
    # them.
    # **YD** only mentions (GT label 'B' or 'I') take part in the entity
    # disambiguation task.
    # At training time, a mention whose entity is in the dictionary is
    # labelled with the correct entity id; a mention whose entity is missing
    # from the dictionary, or which has no corresponding entity, is labelled
    # with an incorrect-entity id.
    # At inference time, the NER labels and the ED labels are evaluated
    # together: a token labelled 'B' with a known predicted entity yields an
    # entity prediction whose mention span is given by the following 'I'
    # labels; if the predicted entity is unknown, the 'B' and the following
    # 'I' labels become 'O'.
    def tokenize_and_align_labels(examples, label_all_tokens=False):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=False,
            truncation=True,
            # Texts in the dataset are lists of words, with a label per word.
            is_split_into_words=True,
            return_offsets_mapping=True,
        )
        offset_mappings = tokenized_inputs.pop("offset_mapping")
        labels = []
        entity_labels = []
        for label, offset_mapping, entity_label in zip(
                examples[label_column_name], offset_mappings,
                examples[entity_column_name]):
            label_index = 0
            current_label = -100
            label_ids = []
            current_entity_label = -100
            entity_label_ids = []
            for offset in offset_mapping:
                # We set the label for the first token of each word.
                # Special tokens have an offset of (0, 0), so this test
                # ignores them.
                if offset[0] == 0 and offset[1] != 0:
                    current_label = label_to_id[label[label_index]]
                    label_index += 1
                    label_ids.append(current_label)

                    current_entity_label = entity_label[label_index - 1]
                    if label[label_index - 1] == NER_LABEL_DICT['O']:
                        current_entity_label = -100
                    else:
                        assert label[label_index - 1] == NER_LABEL_DICT['B'] or \
                            label[label_index - 1] == NER_LABEL_DICT['I']
                        if current_entity_label == _EMPTY_ENTITY_NAME or \
                                label[label_index - 1] == NER_LABEL_DICT['I']:
                            current_entity_label = -100
                        else:
                            assert label[label_index - 1] == NER_LABEL_DICT['B']
                            tmp_label = ent_name_id.get_thid(
                                ent_name_id.get_ent_wikiid_from_name(
                                    current_entity_label, True))
                            if tmp_label != ent_name_id.unk_ent_thid:
                                current_entity_label = tmp_label
                            else:
                                current_entity_label = _OUT_DICT_ENTITY_ID
                    entity_label_ids.append(current_entity_label)
                # For special tokens, set the label to -100 so it is
                # automatically ignored by the loss function.
                elif offset[0] == 0 and offset[1] == 0:
                    label_ids.append(-100)
                    entity_label_ids.append(-100)
                # For the other tokens in a word, set the label to either the
                # current label or -100, depending on label_all_tokens.
                else:
                    label_ids.append(
                        current_label if label_all_tokens else -100)
                    entity_label_ids.append(
                        current_entity_label if label_all_tokens else -100)
            labels.append(label_ids)
            entity_labels.append(entity_label_ids)
        tokenized_inputs["labels"] = labels
        tokenized_inputs["entity_labels"] = entity_labels
        return tokenized_inputs

    tokenized_datasets = dataset.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=1,  # a single process is faster here
        load_from_cache_file=False,
    )

    # 5. set up dataset format and the input/output pipeline
    tokenized_datasets.set_format(type='torch', columns=_EL_COLUMNS)

    # expose components via args
    args.tokenized_datasets = tokenized_datasets
    args.num_labels = num_labels
    args.tokenizer = tokenizer
    args.data_collator = data_collator

    # load the entity embeddings and record their shape
    args.EntityEmbedding = torch.load(args.ent_vecs_filename,
                                      map_location='cpu')
    args.num_entity_labels = args.EntityEmbedding.shape[0]
    args.dim_entity_emb = args.EntityEmbedding.shape[1]

    return cls(args)
def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    # **YD** add BertTokenizerFast to suit the CoNLL-2003 NER task; the
    # pipeline is similar to
    # https://github.com/huggingface/transformers/tree/master/examples/token-classification

    # 1. obtain tokenizer and data_collator
    tokenizer = BertTokenizerFast(args.dict)
    data_collator = YD_DataCollatorForTokenClassification(
        tokenizer, max_length=args.max_pred_length, padding=True)

    # 2. process datasets (tokenization of NER data)
    # **YD** args added in option.py for the fine-tuning task
    data_files = {}
    if args.train_file is not None:
        data_files["train"] = args.train_file
    if args.validation_file is not None:
        data_files["validation"] = args.validation_file
    if args.test_file is not None:
        data_files["test"] = args.test_file
    extension = args.extension_file
    dataset = datasets.load_dataset(extension, data_files=data_files)

    # 3. set up num_labels
    if 'train' in dataset:
        column_names = dataset["train"].column_names
        features = dataset["train"].features
    elif 'validation' in dataset:
        column_names = dataset["validation"].column_names
        features = dataset["validation"].features
    elif 'test' in dataset:
        column_names = dataset["test"].column_names
        features = dataset["test"].features
    else:
        raise ValueError('dataset must contain "train"/"validation"/"test"')

    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = ('ner_tags' if 'ner_tags' in column_names
                         else column_names[1])

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        if 'train' in dataset:
            label_list = get_label_list(dataset["train"][label_column_name])
        elif 'validation' in dataset:
            label_list = get_label_list(
                dataset["validation"][label_column_name])
        elif 'test' in dataset:
            label_list = get_label_list(dataset["test"][label_column_name])
        else:
            raise ValueError(
                'dataset must contain "train"/"validation"/"test"')
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # 4. tokenization
    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples, label_all_tokens=False):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=False,
            truncation=True,
            # Texts in the dataset are lists of words, with a label per word.
            is_split_into_words=True,
            return_offsets_mapping=True,
        )
        offset_mappings = tokenized_inputs.pop("offset_mapping")
        labels = []
        for label, offset_mapping in zip(examples[label_column_name],
                                         offset_mappings):
            label_index = 0
            current_label = -100
            label_ids = []
            for offset in offset_mapping:
                # We set the label for the first token of each word. Special
                # tokens have an offset of (0, 0), so this test ignores them.
                if offset[0] == 0 and offset[1] != 0:
                    current_label = label_to_id[label[label_index]]
                    label_index += 1
                    label_ids.append(current_label)
                # For special tokens, set the label to -100 so it is
                # automatically ignored by the loss function.
                elif offset[0] == 0 and offset[1] == 0:
                    label_ids.append(-100)
                # For the other tokens in a word, set the label to either the
                # current label or -100, depending on label_all_tokens.
                else:
                    label_ids.append(
                        current_label if label_all_tokens else -100)
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_datasets = dataset.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=args.num_workers,
        load_from_cache_file=False,
    )

    # 5. set up dataset format and the input/output pipeline
    tokenized_datasets.set_format(type='torch', columns=_NER_COLUMNS)

    # expose components via args
    args.tokenized_datasets = tokenized_datasets
    args.num_labels = num_labels
    args.tokenizer = tokenizer
    args.data_collator = data_collator

    return cls(args)
def main():
    args = set_args()
    logger = create_logger(args)
    # use the GPU only when requested and available
    args.cuda = torch.cuda.is_available() and not args.no_cuda
    device = 'cuda' if args.cuda else 'cpu'
    logger.info('using device:{}'.format(device))
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    tokenizer = BertTokenizerFast(vocab_file=args.vocab_path,
                                  sep_token="[SEP]",
                                  pad_token="[PAD]",
                                  cls_token="[CLS]")
    model = GPT2LMHeadModel.from_pretrained(args.model_path)
    model = model.to(device)
    model.eval()
    if args.save_samples_path:
        if not os.path.exists(args.save_samples_path):
            os.makedirs(args.save_samples_path)
        samples_file = open(args.save_samples_path + '/samples.txt',
                            'a', encoding='utf8')
        samples_file.write("Chat log {}:\n".format(datetime.now()))
    # chat history; each utterance is stored as a list of token ids
    history = []
    print('Start chatting with the chatbot; press CTRL + Z to exit')

    while True:
        try:
            text = input("user:")
            if args.save_samples_path:
                samples_file.write("user:{}\n".format(text))
            text_ids = tokenizer.encode(text, add_special_tokens=False)
            history.append(text_ids)
            input_ids = [tokenizer.cls_token_id]  # every input starts with [CLS]
            for history_id, history_utr in enumerate(
                    history[-args.max_history_len:]):
                input_ids.extend(history_utr)
                input_ids.append(tokenizer.sep_token_id)
            input_ids = torch.tensor(input_ids).long().to(device)
            input_ids = input_ids.unsqueeze(0)
            response = []  # the response generated from the context
            # generate at most max_len tokens
            for _ in range(args.max_len):
                outputs = model(input_ids=input_ids)
                logits = outputs.logits
                next_token_logits = logits[0, -1, :]
                # penalize every token already generated to reduce repetition
                for id in set(response):
                    next_token_logits[id] /= args.repetition_penalty
                next_token_logits = next_token_logits / args.temperature
                # set the [UNK] logit to -inf so the model can never predict it
                next_token_logits[tokenizer.convert_tokens_to_ids(
                    '[UNK]')] = -float('Inf')
                filtered_logits = top_k_top_p_filtering(next_token_logits,
                                                        top_k=args.topk,
                                                        top_p=args.topp)
                # torch.multinomial samples num_samples elements from the
                # candidate set, weighted by probability, and returns indices
                next_token = torch.multinomial(F.softmax(filtered_logits,
                                                         dim=-1),
                                               num_samples=1)
                if next_token == tokenizer.sep_token_id:
                    # [SEP] marks the end of the response
                    break
                response.append(next_token.item())
                input_ids = torch.cat((input_ids, next_token.unsqueeze(0)),
                                      dim=1)
            history.append(response)
            text = tokenizer.convert_ids_to_tokens(response)
            print("chatbot:" + "".join(text))
            if args.save_samples_path:
                samples_file.write("chatbot:{}\n".format("".join(text)))
        except KeyboardInterrupt:
            if args.save_samples_path:
                samples_file.close()
            break
    vocab_size=task_id_vocab_size,
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=2,
    max_position_embeddings=512,
    type_vocab_size=1,
)

# uid_task_id_sequence_path = 'data/feature_sequence/uid_task_id.txt'
tokenizer = BertTokenizerFast('data/bert_and_tokenizer/uid_task_id-vocab.txt')

model = BertForMaskedLM(config=config)

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=uid_task_id_sequence_path,
    block_size=512,  # maximum sequence length
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

training_args = TrainingArguments(
    output_dir="./tmp",
            torch.tensor(all_attention_mask, dtype=torch.long),
            torch.tensor(all_token_type_ids, dtype=torch.long),
            torch.tensor(all_label_ids, dtype=torch.long))
        sampler = RandomSampler(dataset)
        dataloader = DataLoader(dataset,
                                batch_size=config.batch_size,
                                sampler=sampler)
        return dataloader

    @staticmethod
    def load_dataloader(tokenizer, file_path, verbose=False):
        samples = DataProcessor._read_file(file_path)
        dataloader = DataProcessor._build_dataloader(samples, tokenizer,
                                                     verbose)
        return dataloader


if __name__ == '__main__':
    # longest sentence in the corpus is 1019 characters
    print(
        max([
            len(x.sent)
            for x in DataProcessor._read_file('pku_training.utf8')
        ]))

    from transformers import BertTokenizerFast
    tokenizer = BertTokenizerFast(config.bert_tokenizer_path)
    dataloader = DataProcessor.load_dataloader(tokenizer,
                                               'pku_training.utf8',
                                               verbose=True)
    for batch in dataloader:
        a, b, c, d = batch
        print(a, b, c, d)
        break
import io
import argparse

import torch
from transformers import BertTokenizerFast, BertForTokenClassification
from flask import Flask, jsonify, request

from server.utils import preprocess_data, predict, idx2tag

app = Flask(__name__)
app.config['JSON_SORT_KEYS'] = False

MAX_LEN = 500
NUM_LABELS = 12
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_PATH = 'bert-base-uncased'
STATE_DICT = torch.load("model-state.bin", map_location=DEVICE)
TOKENIZER = BertTokenizerFast("./vocab/vocab.txt", do_lower_case=True)

model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    state_dict=STATE_DICT['model_state_dict'],
    num_labels=NUM_LABELS)
model.to(DEVICE)


@app.route('/predict', methods=['POST'])
def predict_api():
    if request.method == 'POST':
        data = io.BytesIO(request.files.get('resume').read())
        resume_text = preprocess_data(data)
        entities = predict(model, TOKENIZER, idx2tag, DEVICE, resume_text,
                           MAX_LEN)
def create_trelm_roberta_model(pretrained_model_path, vocab_path,
                               do_lower_case, vocab_emb_path, vocab_emb_type,
                               save_model_to, langid_list):
    tokenizer = BertTokenizerFast(vocab_path, do_lower_case=do_lower_case)

    vocab_emb_weights = None
    if vocab_emb_type == 'pth':
        vocab_emb_data = torch.load(vocab_emb_path)
        vocab_emb_weights = vocab_emb_data['vectors']
        assert tokenizer.vocab_size == vocab_emb_weights.size(0)
    elif vocab_emb_type == 'word2vec':
        wv_model = KeyedVectors.load_word2vec_format(vocab_emb_path)
        vocab_emb_weights = torch.FloatTensor(wv_model.vectors)
        assert tokenizer.vocab_size == vocab_emb_weights.size(0)

    model = TrelmRobertaForMaskedLM.from_pretrained(pretrained_model_path)
    if vocab_emb_weights is not None:
        assert model.config.hidden_size == vocab_emb_weights.size(1)

    # set the hyperparameters
    model.config.vocab_size = tokenizer.vocab_size
    model.config.pad_token_id = tokenizer.pad_token_id
    # model.config.bos_token_id = tokenizer.bos_token_id
    # model.config.eos_token_id = tokenizer.eos_token_id
    model.config.max_position_embeddings = model.config.max_position_embeddings - 1
    # model.config.model_type = 'trelm_roberta'
    model.config.architectures = ['TrelmRobertaForMaskedLM']
    model.config.type_vocab_size = 2
    model.config.n_langs = 2
    model.config.langs_to_id = {
        langid: idx
        for idx, langid in enumerate(langid_list)
    }

    # initialize the word embeddings
    model.trelm_roberta.embeddings.word_embeddings = nn.Embedding(
        tokenizer.vocab_size,
        model.config.hidden_size,
        padding_idx=model.config.pad_token_id)
    if vocab_emb_weights is not None:
        model.trelm_roberta.embeddings.word_embeddings.weight.data.copy_(
            vocab_emb_weights)
    else:
        logger.info('word_embeddings randomly initialized!')
        model.trelm_roberta.embeddings.word_embeddings.weight.data.normal_(
            mean=0.0, std=model.config.initializer_range)

    # reset lm_head
    delattr(model, "lm_head")

    # initialize the position embeddings
    old_position_emb_weight = model.trelm_roberta.embeddings.position_embeddings.weight.data
    model.trelm_roberta.embeddings.position_embeddings = nn.Embedding(
        model.config.max_position_embeddings,
        model.config.hidden_size,
        padding_idx=model.config.pad_token_id)
    model.trelm_roberta.embeddings.position_embeddings.weight.data.copy_(
        old_position_emb_weight[1:])
    model.trelm_roberta.embeddings.position_ids = torch.arange(
        model.config.max_position_embeddings).expand((1, -1))

    # initialize lang embeddings?

    # initialize the type embeddings
    new_token_type_embeddings = model.trelm_roberta.embeddings.token_type_embeddings.weight.new_empty(
        model.config.type_vocab_size, model.config.hidden_size)
    new_token_type_embeddings[0, :] = model.trelm_roberta.embeddings.token_type_embeddings.weight
    model.trelm_roberta.embeddings.token_type_embeddings.weight.data = new_token_type_embeddings

    # initialize the translation layer from the middle encoder layer
    layer = model.trelm_roberta.encoder.layer[int(
        model.config.num_hidden_layers / 2)]
    tlayer = model.trelm_roberta.encoder.tlayer
    tlayer.attention.self.query.weight = layer.attention.self.query.weight
    tlayer.attention.self.query.bias = layer.attention.self.query.bias
    tlayer.attention.self.key.weight = layer.attention.self.key.weight
    tlayer.attention.self.key.bias = layer.attention.self.key.bias
    tlayer.attention.self.value.weight = layer.attention.self.value.weight
    tlayer.attention.self.value.bias = layer.attention.self.value.bias
    tlayer.attention.output.dense.weight = layer.attention.output.dense.weight
    tlayer.attention.output.dense.bias = layer.attention.output.dense.bias
    tlayer.attention.output.LayerNorm.weight = layer.attention.output.LayerNorm.weight
    tlayer.attention.output.LayerNorm.bias = layer.attention.output.LayerNorm.bias
    tlayer.intermediate.dense.weight = layer.intermediate.dense.weight
    tlayer.intermediate.dense.bias = layer.intermediate.dense.bias
    tlayer.output.dense.weight = layer.output.dense.weight
    tlayer.output.dense.bias = layer.output.dense.bias
    tlayer.output.LayerNorm.weight = layer.output.LayerNorm.weight
    tlayer.output.LayerNorm.bias = layer.output.LayerNorm.bias

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
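# Reload sketch (not from the original source): once create_trelm_roberta_model
# has run, the converted checkpoint and tokenizer load back from one directory.
def _demo_reload_trelm(save_model_to):
    model = TrelmRobertaForMaskedLM.from_pretrained(save_model_to)
    tokenizer = BertTokenizerFast.from_pretrained(save_model_to)
    return model, tokenizer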
        encoder outputs from the Encoder, in shape (T, B, H)
        :param src_len: used for masking; None or a tensor of shape (B)
            giving sequence lengths
        :return: attention energies in shape (B, T)
        '''
        att = self.attn(x)
        att = torch.tanh(att)  # F.tanh is deprecated in recent PyTorch
        att = F.softmax(att, 1)
        att_x = att * x
        return att_x.sum(1)


tokenizer = BertTokenizerFast('../model_weight/nezha/vocab.txt')

test_set = CustomDataset(test, maxlen=128, tokenizer=tokenizer,
                         with_labels=False)
test_loader = Data.DataLoader(test_set,
                              batch_size=batch_size,
                              num_workers=5,
                              shuffle=False)

train_set = CustomDataset(train, maxlen=128, tokenizer=tokenizer)
train_loader = Data.DataLoader(train_set,
                               batch_size=batch_size,
                               num_workers=5,
                               shuffle=True)