def preprocess_raw_example(
    rawe: RawExample,
    tokenizer: BertTokenizer,
    cond_tokenizer: spm.SentencePieceProcessor,
) -> Tuple[str, Example]:
    e = Example(
        title_token_ids=tokenizer.encode(rawe.title, add_special_tokens=False),
        description_token_ids=tokenizer.encode(rawe.description, add_special_tokens=False),
        condition_token_ids=cond_tokenizer.EncodeAsIds(rawe.condition),
        fact_token_ids=tokenizer.encode(rawe.fact, add_special_tokens=False),
        description=rawe.description,
    )
    return hashlib.sha1(json.dumps(e.__dict__).encode()).hexdigest(), e
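
# Minimal usage sketch (illustrative): it assumes `RawExample` exposes the four
# string fields read above and that a SentencePiece condition model has already
# been trained; the checkpoint name and "cond.model" path are placeholders.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
cond_tokenizer = spm.SentencePieceProcessor()
cond_tokenizer.Load("cond.model")  # placeholder path to the condition tokenizer model

raw = RawExample(title="...", description="...", condition="...", fact="...")
key, example = preprocess_raw_example(raw, tokenizer, cond_tokenizer)
print(key)  # SHA-1 of the serialized example, usable as a stable lookup key
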
def predict(session: InferenceSession, tokenizer: BertTokenizer, text):
    tokens = tokenizer(text, return_attention_mask=True, return_tensors="pt")
    # ONNX Runtime expects NumPy inputs; np.atleast_2d also converts the tensors.
    inputs_onnx = {k: np.atleast_2d(v) for k, v in tokens.items()}
    logits = session.run(None, inputs_onnx)[0].squeeze(0)
    input_ids = tokens["input_ids"][0]

    # Softmax over the label dimension, then take the most likely label per token.
    score = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)
    labels_idx = score.argmax(axis=-1)

    entities = []
    # Keep only tokens whose predicted label is not in IGNORE_LABELS.
    filtered_labels_idx = [
        (idx, label_idx)
        for idx, label_idx in enumerate(labels_idx)
        if config["id2label"][str(label_idx)] not in IGNORE_LABELS
    ]
    for idx, label_idx in filtered_labels_idx:
        entity = {
            "word": tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
            "score": score[idx][label_idx].item(),
            "entity": config["id2label"][str(label_idx)],
            "index": idx,
        }
        entities += [entity]

    answers = group_entities(entities, tokenizer)
    return render_ner_html_custom(text, answers, colors=colors)
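
# Rough usage sketch (illustrative): it assumes the module-level names the function
# relies on (`config` with an "id2label" map, IGNORE_LABELS, group_entities,
# render_ner_html_custom, colors) are defined elsewhere, and that the ONNX file was
# exported from a token-classification checkpoint matching the tokenizer. The model
# path and checkpoint name below are placeholders.
import numpy as np
from onnxruntime import InferenceSession
from transformers import BertTokenizer

session = InferenceSession("ner-bert.onnx")
tokenizer = BertTokenizer.from_pretrained("dslim/bert-base-NER")
html = predict(session, tokenizer, "Hugging Face is based in New York City.")
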
def __init__(
    self,
    token_indexers: Dict[str, TokenIndexer] = None,
    domain_identifier: str = None,
    bert_model_name: str = None,
    **kwargs,
) -> None:
    super().__init__(**kwargs)
    if token_indexers is not None:
        self._token_indexers = token_indexers
    elif bert_model_name is not None:
        from allennlp.data.token_indexers import PretrainedTransformerIndexer

        self._token_indexers = {"tokens": PretrainedTransformerIndexer(bert_model_name)}
    else:
        self._token_indexers = {"tokens": SingleIdTokenIndexer()}
    self._domain_identifier = domain_identifier

    if bert_model_name is not None:
        self.bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
        self.lowercase_input = "uncased" in bert_model_name
    else:
        self.bert_tokenizer = None
        self.lowercase_input = False
def __init__(self, args):
    super(KobeModel, self).__init__()

    self.encoder = Encoder(
        vocab_size=args.text_vocab_size + args.cond_vocab_size,
        max_seq_len=args.max_seq_len,
        d_model=args.d_model,
        nhead=args.nhead,
        num_layers=args.num_encoder_layers,
        dropout=args.dropout,
        mode=args.mode,
    )
    self.decoder = Decoder(
        vocab_size=args.text_vocab_size,
        max_seq_len=args.max_seq_len,
        d_model=args.d_model,
        nhead=args.nhead,
        num_layers=args.num_decoder_layers,
        dropout=args.dropout,
    )
    self.lr = args.lr
    self.d_model = args.d_model
    self.loss = nn.CrossEntropyLoss(reduction="mean", ignore_index=0, label_smoothing=0.1)
    self._reset_parameters()

    self.decoding_strategy = args.decoding_strategy
    self.vocab = BertTokenizer.from_pretrained(args.text_vocab_path)
    self.bleu = BLEU(tokenize=args.tokenize)
    self.sacre_tokenizer = _get_tokenizer(args.tokenize)()
    self.bert_scorer = BERTScorer(lang=args.tokenize, rescale_with_baseline=True)
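
# The constructor calls self._reset_parameters(), which is not shown here. A common
# implementation for Transformer encoder/decoder stacks (and only an assumption for
# this model) is Xavier-uniform initialization of every weight matrix:
def _reset_parameters(self):
    # Assumed sketch: initialize all multi-dimensional parameters with Xavier uniform.
    for p in self.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
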
def setUp(self):
    super().setUp()
    self.tokenizers = [BertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
    self.tf_tokenizers = [TFBertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
    self.test_sentences = [
        "This is a straightforward English test sentence.",
        "This one has some weird characters\rto\nsee\r\nif those\u00E9break things.",
        "Now we're going to add some Chinese: 一 二 三 一二三",
        "And some much more rare Chinese: 齉 堃 齉堃",
        "Je vais aussi écrire en français pour tester les accents",
        "Classical Irish also has some unusual characters, so in they go: Gaelaċ, ꝼ",
    ]
    self.paired_sentences = list(zip(self.test_sentences, self.test_sentences[::-1]))
def main(args):
    # For Chinese (Ro)BERT(a), the best results come from RoBERTa-wwm-ext
    # (https://github.com/ymcui/Chinese-BERT-wwm). To fine-tune these models we have
    # to use the same word segmenter: LTP (https://github.com/HIT-SCIR/ltp).
    with open(args.file_name, "r", encoding="utf-8") as f:
        data = f.readlines()
    # Drop empty/whitespace-only lines to avoid delimiters like '\u2029'.
    data = [line.strip() for line in data if len(line) > 0 and not line.isspace()]

    ltp_tokenizer = LTP(args.ltp)  # runs faster on a GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)

    ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)

    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)
def setup_method(self):
    self.monkeypatch = MonkeyPatch()
    # Monkeypatch the pretrained BERT model/tokenizer to return the tiny test fixture model.
    config_path = FIXTURES_ROOT / "structured_prediction" / "srl" / "bert" / "config.json"
    vocab_path = FIXTURES_ROOT / "structured_prediction" / "srl" / "bert" / "vocab.txt"
    config = BertConfig.from_json_file(config_path)
    self.monkeypatch.setattr(BertModel, "from_pretrained", lambda _: BertModel(config))
    self.monkeypatch.setattr(BertTokenizer, "from_pretrained", lambda _: BertTokenizer(vocab_path))

    super().setup_method()
    self.set_up_model(
        FIXTURES_ROOT / "structured_prediction" / "srl" / "bert_srl.jsonnet",
        FIXTURES_ROOT / "structured_prediction" / "srl" / "conll_2012",
    )
def prepare_ref(lines: List[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
    # Segment the input with LTP in batches of 100 lines.
    ltp_res = []
    for i in range(0, len(lines), 100):
        res = ltp_tokenizer.seg(lines[i : i + 100])[0]
        res = [get_chinese_word(r) for r in res]
        ltp_res.extend(res)
    assert len(ltp_res) == len(lines)

    # Tokenize the same lines with the BERT tokenizer, also in batches of 100.
    bert_res = []
    for i in range(0, len(lines), 100):
        res = bert_tokenizer(lines[i : i + 100], add_special_tokens=True, truncation=True, max_length=512)
        bert_res.extend(res["input_ids"])
    assert len(bert_res) == len(lines)

    ref_ids = []
    for input_ids, chinese_word in zip(bert_res, ltp_res):
        input_tokens = []
        for id in input_ids:
            token = bert_tokenizer._convert_id_to_token(id)
            input_tokens.append(token)
        input_tokens = add_sub_symbol(input_tokens, chinese_word)
        ref_id = []
        # Only record the positions of Chinese subwords starting with "##",
        # i.e. tokens that are the continuation of a whole word.
        for i, token in enumerate(input_tokens):
            if token[:2] == "##":
                clean_token = token[2:]
                # save the Chinese token's position
                if len(clean_token) == 1 and _is_chinese_char(ord(clean_token)):
                    ref_id.append(i)
        ref_ids.append(ref_id)
    assert len(ref_ids) == len(bert_res)
    return ref_ids
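
# Quick usage sketch (illustrative): it assumes the helpers used above
# (get_chinese_word, add_sub_symbol, _is_chinese_char) live in the same module and
# that the installed `ltp` version still exposes the seg() API called above. The
# LTP model name and BERT checkpoint are placeholders.
from ltp import LTP
from transformers import BertTokenizer

lines = ["今天天气很好。", "我们正在测试全词掩码。"]
ltp_tokenizer = LTP("small")
bert_tokenizer = BertTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
ref_ids = prepare_ref(lines, ltp_tokenizer, bert_tokenizer)
# ref_ids[i] lists the token positions in sentence i that continue a Chinese word ("##" pieces).
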
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    ## Other parameters
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")

    args = parser.parse_args()

    processors = {"rte": RteProcessor}
    output_modes = {"rte": "classification"}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    threeway_train_examples, threeway_dev_examples = processor.get_MNLI_train_and_dev(
        '/export/home/Dataset/glue_data/MNLI/train.tsv',
        ['/export/home/Dataset/glue_data/MNLI/dev_mismatched.tsv',
         '/export/home/Dataset/glue_data/MNLI/dev_matched.tsv'])

    '''preprocessing: binary classification, randomly sample 20k for testing data'''
    train_examples = []
    for ex in threeway_train_examples:
        if ex.label == 'neutral' or ex.label == 'contradiction':
            ex.label = 'neutral'
        train_examples.append(ex)
    # train_examples = train_examples[:100]
    dev_examples = []
    for ex in threeway_dev_examples:
        if ex.label == 'neutral' or ex.label == 'contradiction':
            ex.label = 'neutral'
        dev_examples.append(ex)

    random.shuffle(dev_examples)
    test_examples = dev_examples[:13000]
    dev_examples = dev_examples[13000:]

    label_list = ["entailment", "neutral"]  # , "contradiction"
    num_labels = len(label_list)
    print('num_labels:', num_labels, 'training size:', len(train_examples),
          'dev size:', len(dev_examples), ' test size:', len(test_examples))

    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    model = BertForSequenceClassification(num_labels)
    tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_acc = 0.0
    max_dev_acc = 0.0

    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,  # True only for xlnet, which has its cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,  # 2 for xlnet
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,  # roberta uses an extra separator between sentence pairs,
                                   # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False,  # True for xlnet (pad on the left)
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)  # 4 for xlnet

        '''load dev set'''
        dev_features = convert_examples_to_features(
            dev_examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,
            pad_on_left=False,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)

        dev_all_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
        dev_all_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
        dev_all_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
        dev_all_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)

        dev_data = TensorDataset(dev_all_input_ids, dev_all_input_mask, dev_all_segment_ids, dev_all_label_ids)
        dev_sampler = SequentialSampler(dev_data)
        dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.eval_batch_size)

        '''load test set'''
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, tokenizer, output_mode,
            cls_token_at_end=False,
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=True,
            pad_on_left=False,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0)

        test_all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        test_all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        test_all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        test_all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)

        test_data = TensorDataset(test_all_input_ids, test_all_input_mask, test_all_segment_ids, test_all_label_ids)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        iter_co = 0
        final_test_performance = 0.0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                logits = model(input_ids, input_mask)
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                iter_co += 1

            '''start evaluate on dev set after this epoch'''
            model.eval()
            # eval_loss = 0
            # nb_eval_steps = 0
            # preds = []
            # gold_label_ids = []
            # print('Evaluating...')
            # for input_ids, input_mask, segment_ids, label_ids in dev_dataloader:
            #     input_ids = input_ids.to(device)
            #     input_mask = input_mask.to(device)
            #     segment_ids = segment_ids.to(device)
            #     label_ids = label_ids.to(device)
            #     gold_label_ids += list(label_ids.detach().cpu().numpy())
            #     with torch.no_grad():
            #         logits = model(input_ids, input_mask)
            #     if len(preds) == 0:
            #         preds.append(logits.detach().cpu().numpy())
            #     else:
            #         preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
            # preds = preds[0]
            # pred_probs = softmax(preds, axis=1)
            # pred_label_ids = list(np.argmax(pred_probs, axis=1))
            # assert len(pred_label_ids) == len(gold_label_ids)
            # hit_co = 0
            # for k in range(len(pred_label_ids)):
            #     if pred_label_ids[k] == gold_label_ids[k]:
            #         hit_co += 1
            # test_acc = hit_co / len(gold_label_ids)

            dev_acc = evaluation(dev_dataloader, device, model)
            if dev_acc > max_dev_acc:
                max_dev_acc = dev_acc
                print('\ndev acc:', dev_acc, ' max_dev_acc:', max_dev_acc, '\n')
                '''evaluate on the test set with the best dev model'''
                final_test_performance = evaluation(test_dataloader, device, model)
                print('\ntest acc:', final_test_performance, '\n')
            else:
                print('\ndev acc:', dev_acc, ' max_dev_acc:', max_dev_acc, '\n')
        print('final_test_performance:', final_test_performance)
def get_tokenizer(self, **kwargs):
    return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
default="icod-icod", help="Model variant to run.") parser.add_argument( "--dataset", type=str, default="/data/nv419/VQG_DATA/processed/iq_dataset.hdf5") parser.add_argument( "--val_dataset", type=str, default="/data/nv419/VQG_DATA/processed/iq_val_dataset.hdf5") args = parser.parse_args() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') args.device = device tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") tokenizer.post_processor = TemplateProcessing( single="[CLS] $A [SEP]", special_tokens=[("[CLS]", 1), ("[SEP]", 2)], ) data_loader = get_loader(os.path.join(os.getcwd(), args.dataset), tokenizer, args.batch_size, shuffle=True, num_workers=8) val_data_loader = get_loader(os.path.join(os.getcwd(), args.val_dataset), tokenizer, args.batch_size, shuffle=False, num_workers=8)
def get_bert_vocab_size(vocab_path: str) -> int:
    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    return tokenizer.vocab_size
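
# Illustrative call: `vocab_path` can be a hub checkpoint name or a local directory
# containing a vocab.txt (the checkpoint below is just an example).
vocab_size = get_bert_vocab_size("bert-base-chinese")
print(vocab_size)  # 21128 for bert-base-chinese
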
import tempfile
from argparse import ArgumentParser

import sentencepiece as spm
from transformers.models.bert.tokenization_bert import BertTokenizer

# Load the text tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
BOS_TOKEN = tokenizer.cls_token
EOS_TOKEN = tokenizer.sep_token
UNK_TOKEN = tokenizer.unk_token
PAD_ID = tokenizer.pad_token_id
BOS_ID = tokenizer.cls_token_id
EOS_ID = tokenizer.sep_token_id
UNK_ID = tokenizer.unk_token_id

# Build the condition (attribute) tokenizer
if __name__ == "__main__":
    parser = ArgumentParser()
    # fmt: off
    parser.add_argument("--input", nargs="+", required=True)
    parser.add_argument("--vocab-file", type=str, required=True)
    parser.add_argument("--vocab-size", type=int, default=31)
    parser.add_argument("--algo", type=str, default="bpe", choices=["bpe", "word"])
    # fmt: on
    args = parser.parse_args()

    print("Building token vocabulary")
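
    # The snippet stops right before the vocabulary is actually built. The following is
    # only a sketch of what the training call could look like, using the standard
    # SentencePiece Python API; the options in the original script may differ.
    spm.SentencePieceTrainer.Train(
        input=",".join(args.input),    # SentencePiece accepts a comma-separated file list
        model_prefix=args.vocab_file,  # writes <vocab-file>.model and <vocab-file>.vocab
        vocab_size=args.vocab_size,
        model_type=args.algo,          # "bpe" or "word", matching the --algo choices
    )
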
def tokenizer():
    return BertTokenizer.from_pretrained('bert-base-cased')
def main():
    parser = argparse.ArgumentParser(description='seq2seq')
    parser.add_argument('--model', default='seq2seq', type=str,
                        help='which model you are going to train, now including [seq2seq, pmi_seq2seq]')
    parser.add_argument('--attn', default=None, type=str,
                        help='which attention method to use, including [dot, general, concat]')
    parser.add_argument('--gpu', default=-1, type=int, help='which GPU to use, -1 means using CPU')
    parser.add_argument('--save', action="store_true", help='whether to save model or not')
    parser.add_argument('--bs', default=64, type=int, help='batch size')
    parser.add_argument('--emb_dim', default=300, type=int, help='embedding dim')
    parser.add_argument('--enc_hid_dim', default=300, type=int, help='hidden dim of lstm')
    parser.add_argument('--dec_hid_dim', default=300, type=int, help='hidden dim of lstm')
    parser.add_argument('--birnn', action='store_true', help='whether to use bidirectional rnn, default False')
    parser.add_argument('--n_layers', default=1, type=int, help='layer num of encoder and decoder')
    parser.add_argument('--dropout', default=0.5, type=float, help='dropout ratio')
    parser.add_argument('--n_epochs', default=30, type=int, help='num of train epoch')
    parser.add_argument('--min_freq', default=1, type=int, help='minimal occur times for vocabulary')
    parser.add_argument('--clip', default=None, type=float, help='grad clip')
    parser.add_argument('--maxlen', default=None, type=int, help='max length of text')
    parser.add_argument('--dataset_dir_path', default=None, type=str,
                        help='path to directory where data file is saved')
    parser.add_argument('--tokenizer', default='spacy_en', type=str,
                        help='which tokenizer to use for the dataset')
    parser.add_argument('--train_file', default=None, type=str, help='train file name')
    parser.add_argument('--valid_file', default=None, type=str, help='valid file name')
    parser.add_argument('--test_file', default=None, type=str, help='test file name')
    parser.add_argument('--save_dir', default='models', type=str, help='save dir')
    parser.add_argument('--vocab_file', default=None, type=str, help='predefined vocab file')
    parser.add_argument('--num_workers', default=0, type=int,
                        help='how many subprocesses to use for data loading. '
                             '0 means that the data will be loaded in the main process.')
    parser.add_argument('--l2', default=0, type=float, help='l2 regularization')
    parser.add_argument('--lr', default=1e-4, type=float, help='learning rate')
    parser.add_argument('--teaching_rate', default=1, type=float,
                        help='probability of using teacher forcing')
    parser.add_argument('--pretrained_embed_file', default=None, type=str, help='torchtext vector name')
    parser.add_argument('--warmup', default=0, type=int, help='warmup steps, 0 means not using NoamOpt')
    parser.add_argument('--cell_type', default='LSTM', type=str, help='cell type of encoder/decoder, LSTM or GRU')
    parser.add_argument('--comment', default='', type=str,
                        help='comment, will be used as prefix of save directory')
    parser.add_argument('--smoothing', default=0.0, type=float, help='smoothing rate of computing kl div loss')
    parser.add_argument('--max_vocab_size', default=None, type=int, help='max size of vocab')
    parser.add_argument('--serialize', action='store_true', help='whether to serialize examples and vocab')
    parser.add_argument('--use_serialized', action='store_true', help='whether to use serialized dataset')
    parser.add_argument('--model_path', default=None, type=str, help='restore model to continue training')
    parser.add_argument('--global_step', default=0, type=int, help='global step for continuing training')
    parser.add_argument('--inference', action='store_true', help='inference mode')
    parser.add_argument('--seed', default=20020206, type=int, help='random seed')
    parser.add_argument('--ln', action='store_true',
                        help='whether to use layernorm; if model is pmi_seq2seq, conditional layernorm is used by default')
    parser.add_argument('--patience', default=None, type=int,
                        help='stop when {patience} consecutive epochs give no improved performance')
    args, unparsed = parser.parse_known_args()

    setup_random_seed(args.seed)

    writer = None
    if args.save:
        tz_sh = tz.gettz('Asia/Shanghai')
        save_dir = os.path.join(
            args.save_dir,
            args.comment + 'run' + str(datetime.now(tz=tz_sh)).replace(":", "-").split(".")[0].replace(" ", '.'))
        if args.model_path:
            save_dir = os.path.split(args.model_path)[0]
        args.save_dir = save_dir
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        with open(os.path.join(save_dir, 'args.txt'), 'w') as f:
            json.dump(args.__dict__, f, indent=2)
        writer = SummaryWriter(os.path.join(save_dir, 'summary'))

    device = torch.device(args.gpu if (torch.cuda.is_available() and args.gpu >= 0) else 'cpu')
    args.device = device

    if args.tokenizer == 'spacy_en':
        dataset = seq2seq_dataset(args)
    elif args.tokenizer == 'jieba':
        from data.dataset import jieba_tokenize
        dataset = seq2seq_dataset(args, tokenizer=jieba_tokenize)
    elif args.tokenizer == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        dataset = seq2seq_dataset(args, tokenizer=tokenizer.tokenize)
    elif args.tokenizer == 'whitespace':
        from data.dataset import whitespace_tokenize
        dataset = seq2seq_dataset(args, tokenizer=whitespace_tokenize)
    # dataset = load_iwslt(args)

    SRC = dataset['fields']['src']
    TGT = dataset['fields']['tgt']

    EMB_DIM = args.emb_dim
    ENC_HID_DIM = args.enc_hid_dim
    DEC_HID_DIM = args.dec_hid_dim
    N_LAYERS = args.n_layers
    ENC_DROPOUT = args.dropout
    DEC_DROPOUT = args.dropout
    SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
    TGT_PAD_IDX = TGT.vocab.stoi[TGT.pad_token]
    N_EPOCHS = args.n_epochs
    CLIP = args.clip

    src_embedding = Embedding(len(SRC.vocab), EMB_DIM, padding_idx=SRC_PAD_IDX, dropout=ENC_DROPOUT)
    tgt_embedding = Embedding(len(TGT.vocab), EMB_DIM, padding_idx=TGT_PAD_IDX, dropout=DEC_DROPOUT)

    if args.pretrained_embed_file:
        # The pretrained weights live in the vocab's `vectors` attribute.
        src_pretrained_vectors = SRC.vocab.vectors
        tgt_pretrained_vectors = TGT.vocab.vectors
        # Initialize the embedding matrices with the pretrained weights.
        src_embedding.lut.weight.data.copy_(src_pretrained_vectors)
        tgt_embedding.lut.weight.data.copy_(tgt_pretrained_vectors)
        print("pretrained vectors loaded successfully!")

    enc = RNNBaseEncoder(args.cell_type, EMB_DIM, ENC_HID_DIM, N_LAYERS,
                         bidirectional=args.birnn, dropout=ENC_DROPOUT, layernorm=args.ln)
    if args.attn is not None:
        dec = LuongAttnRNNDecoder(args.cell_type, EMB_DIM, ENC_HID_DIM, DEC_HID_DIM,
                                  attn_method=args.attn, num_layers=N_LAYERS, dropout=DEC_DROPOUT)
    else:
        dec = RNNBaseDecoder(args.cell_type, EMB_DIM, DEC_HID_DIM, num_layers=N_LAYERS, dropout=DEC_DROPOUT)
    generator = Generator(DEC_HID_DIM, len(TGT.vocab))

    if args.ln:
        if args.model == 'seq2seq':
            layernorm = LayerNorm(feature=ENC_HID_DIM)
        elif args.model == 'pmi_seq2seq':
            layernorm = LayerNorm(feature=DEC_HID_DIM, conditional=True,
                                  condition_size=len(TGT.vocab),
                                  condition_hidden_size=DEC_HID_DIM,
                                  condition_activation="ReLU")
        else:
            raise ValueError(args.model, "is not a legal model name!")
    else:
        layernorm = None

    if args.model == 'seq2seq':
        model = RNNBaseSeq2Seq(enc, dec, src_embedding, tgt_embedding, generator).to(device)
        train_pmi = None
    elif args.model == 'pmi_seq2seq':
        # By default pmi_hid_dim = ENC_HID_DIM, so dec_hid_dim must be twice enc_hid_dim!
        model = RNNBasePMISeq2Seq(ENC_HID_DIM, enc, dec, src_embedding, tgt_embedding,
                                  generator, layernorm).to(device)
        from scipy import sparse
        train_pmi = sparse.load_npz(os.path.join(args.dataset_dir_path, "train_sparse_pmi_matrix.npz"))
        # valid_pmi = sparse.load_npz(os.path.join(args.dataset_dir_path, "valid_sparse_pmi_matrix.npz"))
        # test_pmi = sparse.load_npz(os.path.join(args.dataset_dir_path, "test_sparse_pmi_matrix.npz"))
        # The valid/test PMI matrices don't seem to be needed; using them would amount to label leakage.

    if args.model_path is not None:
        logger.info(f"Restore model from {args.model_path}...")
        # model.load_state_dict(torch.load(args.model_path, map_location={'cuda:0': 'cuda:' + str(args.gpu)}))
        model = torch.load(args.model_path, map_location={'cuda:0': 'cuda:' + str(args.gpu)})
        model.to(args.device)
    print(model)

    weight = torch.ones(len(TGT.vocab), device=args.device)
    weight[TGT_PAD_IDX] = 0
    criterion = nn.NLLLoss(reduction='sum', ignore_index=TGT_PAD_IDX, weight=weight)
    # criterion = LabelSmoothing(args, len(TGT.vocab), padding_idx=TGT_PAD_IDX, smoothing=args.smoothing)

    if args.inference:
        try:
            assert args.model_path is not None
        except AssertionError:
            logger.error("If you want to do inference, you must offer a trained model's path!")
        finally:
            inference(args, model, dataset['valid_iterator'], fields=dataset['fields'], mode='valid', pmi=train_pmi)
            inference(args, model, dataset['test_iterator'], fields=dataset['fields'], mode='test', pmi=train_pmi)
            return 0

    print(f'The model has {count_parameters(model):,} trainable parameters')

    optimizer = AdamOptimizer(model.parameters(), lr=args.lr, weight_decay=args.l2, max_grad_norm=args.clip)
    # optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2)
    if args.warmup > 0:
        optimizer = NoamOptimWrapper(args.hid_dim, 1, args.warmup, optimizer)
    if args.global_step > 0:
        logger.info(f'Global step start from {args.global_step}')
        optimizer._step = args.global_step

    # TODO: stop hard-coding how the best metrics are saved.
    best_global_step = 0
    best_valid_loss = float('inf')
    best_test_loss = float('inf')
    global_step = optimizer._step
    patience = args.patience if args.patience else float('inf')
    no_improve = 0

    for epoch in range(N_EPOCHS):
        start_time = time.time()
        train_metrics = train(args, model, dataset['train_iterator'], optimizer, criterion,
                              fields=dataset['fields'], writer=writer, pmi=train_pmi)
        global_step += len(dataset['train_iterator'])
        args.global_step = global_step
        valid_metrics = evaluate(args, model, dataset['valid_iterator'], criterion,
                                 fields=dataset['fields'], pmi=train_pmi)
        test_metrics = evaluate(args, model, dataset['test_iterator'], criterion,
                                fields=dataset['fields'], pmi=train_pmi)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        print(f'Epoch: {epoch + 1:02} | Global step: {global_step} | Time: {epoch_mins}m {epoch_secs}s')
        for metrics, mode in zip([train_metrics, valid_metrics, test_metrics], ['Train', 'Valid', 'Test']):
            print_metrics(metrics, mode=mode)

        # TODO: improve how the log file is stored; remove the hard-coded paths.
        if args.save:
            write_metrics_to_writer(valid_metrics, writer, global_step, mode='Valid')
            write_metrics_to_writer(test_metrics, writer, global_step, mode='Test')
            best_valid_loss = valid_metrics['epoch_loss'] if valid_metrics['epoch_loss'] < best_valid_loss else best_valid_loss
            best_test_loss = test_metrics['epoch_loss'] if test_metrics['epoch_loss'] < best_test_loss else best_test_loss
            best_global_step = global_step if valid_metrics['epoch_loss'] == best_valid_loss else best_global_step
            if best_global_step == global_step:
                torch.save(model, os.path.join(save_dir, f'model_global_step-{global_step}.pt'))
                # torch.save(model.state_dict(), os.path.join(save_dir, f'model_global_step-{global_step}.pt'))
                no_improve = 0
            else:
                no_improve += 1
            with open(os.path.join(save_dir, f'log_global_step-{global_step}.txt'), 'w') as log_file:
                valid_metrics['Best Global Step'] = best_global_step
                valid_metrics['Best Loss'] = best_valid_loss
                test_metrics['Best Loss'] = best_test_loss
                test_metrics['Best PPL'] = math.exp(best_test_loss)
                inference(args, model, dataset['valid_iterator'], fields=dataset['fields'], mode='valid', pmi=train_pmi)
                inference(args, model, dataset['test_iterator'], fields=dataset['fields'], mode='test', pmi=train_pmi)
                valid_path_hyp = os.path.join(args.save_dir, 'responses-valid.txt')
                test_path_hyp = os.path.join(args.save_dir, 'responses-test.txt')
                valid_path_ref = os.path.join(args.save_dir, 'answers-valid.txt')
                test_path_ref = os.path.join(args.save_dir, 'answers-test.txt')
                other_valid_metrics = calc_metrics(path_refs=valid_path_ref, path_hyp=valid_path_hyp)
                other_test_metrics = calc_metrics(path_refs=test_path_ref, path_hyp=test_path_hyp)
                valid_metrics.update(other_valid_metrics)
                test_metrics.update(other_test_metrics)
                os.remove(os.path.join(args.save_dir, 'posts-valid.txt'))
                os.remove(os.path.join(args.save_dir, 'posts-test.txt'))
                os.remove(valid_path_hyp)
                os.remove(valid_path_ref)
                os.remove(test_path_hyp)
                os.remove(test_path_ref)
                for metric, performance in valid_metrics.items():
                    log_file.write(f'Valid {metric}: {performance}\n')
                for metric, performance in test_metrics.items():
                    log_file.write(f'Test {metric}: {performance}\n')

        if no_improve >= patience:
            break
    if len(examples) > 10000:
        # Save the training data to 8 shards.
        shard_size = (len(examples) + 7) // 8
        for shard_id in range(8):
            write_to_tar(
                f"{output}-{shard_id}.tar",
                examples[shard_id * shard_size:(shard_id + 1) * shard_size],
            )
    else:
        write_to_tar(f"{output}.tar", examples)


if __name__ == "__main__":
    parser = ArgumentParser()
    add_options(parser)
    args = parser.parse_args()
    prepare_file(args)

    np.random.seed(42)
    text_tokenizer = BertTokenizer.from_pretrained(args.vocab_file)
    cond_tokenizer = spm.SentencePieceProcessor()
    cond_tokenizer.Load(args.cond_vocab_file)
    for split in args.split:
        preprocess_raw(
            input_prefix=os.path.join(args.raw_path, split),
            output=os.path.join(args.processed_path, f"{split}"),
            text_tokenizer=text_tokenizer,
            cond_tokenizer=cond_tokenizer,
        )
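
# Sanity check (illustrative, with a hypothetical example count): the ceiling-division
# shard size used above covers every example exactly once, even when len(examples)
# is not a multiple of 8.
examples_demo = list(range(20001))
shard_size_demo = (len(examples_demo) + 7) // 8  # same formula as in the sharding branch
shards = [examples_demo[i * shard_size_demo:(i + 1) * shard_size_demo] for i in range(8)]
assert sum(len(s) for s in shards) == len(examples_demo)
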