def __init__(self):
    self.device = torch.device("cuda:0")
    self.bertmodel, self.vocab = get_pytorch_kobert_model()
    # tokenization
    tokenizer = get_tokenizer()
    self.tok = nlp.data.BERTSPTokenizer(tokenizer, self.vocab, lower=False)
    self.max_len = 64
    self.batch_size = 64
def predict(model, text):
    device = torch.device("cuda:0")
    max_len = 64
    batch_size = 64
    tokenizer = get_tokenizer()
    bertmodel, vocab = get_pytorch_kobert_model()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    data_test = BERTDataset(text, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)
    model.eval()
    answer = []
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        max_vals, max_indices = torch.max(out, 1)
        answer.append(max_indices.cpu().clone().numpy())
    result = F.softmax(out, dim=1)  # explicit dim; note this is the last batch only
    print(result)
    return result
def get_kobert_model_and_tokenizer():
    tok_path = get_tokenizer()
    basic_tokenizer = SentencepieceTokenizer(tok_path)
    bert_base, vocab = get_pytorch_kobert_model()
    kobert_tokenizer = KoBertTokenizer(basic_tokenizer, vocab)
    return bert_base, kobert_tokenizer
def main():
    nsmc_home_dir = 'NSMC_DIR'
    train_file = nsmc_home_dir + '/ratings_train.txt'  # 150K
    test_file = nsmc_home_dir + '/ratings_test.txt'    # 50K
    model, vocab = get_pytorch_kobert_model(ctx='cuda' if torch.cuda.is_available() else 'cpu')
    lr = 5e-5
    batch_size = 16
    epochs = 5
    dropout_rate = 0.1
    max_grad_norm = 1.0
    num_total_steps = math.ceil(150000 / batch_size) * epochs
    num_warmup_steps = num_total_steps // 10
    log_interval = 100
    seed = 2019
    num_workers = 2
    num_classes = 2
    pooler_out_dim = model.pooler.dense.out_features
    torch.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('device', device)
    tok_path = get_tokenizer()
    sp = SentencepieceTokenizer(tok_path)
    train_loader = torch.utils.data.DataLoader(
        MovieDataset(get_data(train_file, vocab, sp)), shuffle=True, batch_size=batch_size,
        num_workers=num_workers, collate_fn=batchify, pin_memory=True)
    test_loader = torch.utils.data.DataLoader(
        MovieDataset(get_data(test_file, vocab, sp)), batch_size=batch_size, shuffle=False,
        num_workers=num_workers, collate_fn=batchify, pin_memory=True)
    linear = torch.nn.Linear(pooler_out_dim, num_classes).to(device)
    all_params = list(model.parameters()) + list(linear.parameters())
    optimizer = AdamW(all_params, lr=lr, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_total_steps)
    for epoch in range(epochs):
        train(train_loader, device, model, linear, all_params, optimizer, scheduler,
              dropout_rate, max_grad_norm, log_interval, epoch)
        print(datetime.now(), 'Testing...')
        test(test_loader, device, model, linear)
def load_model(file):
    device = torch.device("cuda:0")
    bertmodel, vocab = get_pytorch_kobert_model()
    model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
    model.load_state_dict(torch.load(file))
    model.eval()
    return model
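# Hypothetical usage of load_model() and predict() above. The checkpoint path
# and the input format (sentence/label pairs, as BERTDataset expects; the label
# is a dummy value at inference time) are assumptions, not part of the snippets.
if __name__ == "__main__":
    model = load_model("kobert_classifier.pt")
    sample = [["이 영화 정말 재밌었다", "0"]]
    probs = predict(model, sample)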
def submit(args):
    bert_model, vocab = get_pytorch_kobert_model()
    test_dataset = SentenceDataset(args.test_file, vocab, max_token_cnt=args.max_token_cnt)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1,
                                              num_workers=args.num_workers, shuffle=False)
    model = ExtractiveModel(bert_model, 100, 11, 768,
                            use_bert_sum_words=args.use_bert_sum_words,
                            use_pos=args.use_pos,
                            use_media=args.use_media,
                            num_classes=2,
                            simple_model=args.simple_model,
                            dim_feedforward=args.dim_feedforward,
                            dropout=args.dropout)
    if args.checkpoint_path is not None and os.path.isfile(args.checkpoint_path):
        state_dict = torch.load(args.checkpoint_path)[0]
        model.load_state_dict(state_dict)
    model.eval()  # set model to evaluate mode
    device = 'cuda'
    model.to(device)
    ids = []
    summaries = []
    for step, (token_ids_batch, pos_idx_batch, media_batch) in enumerate(test_loader):
        if step % 10 == 0:
            print(step, len(test_loader))
        token_ids_batch = token_ids_batch[0].to(device)
        pos_idx_batch = pos_idx_batch[0].to(device)
        media_batch = media_batch[0].to(device)
        sentences, _, id = test_dataset.samples[step]
        ids.append(id)
        sentences = np.array(sentences)
        with torch.set_grad_enabled(False):
            outputs = model(token_ids_batch, pos_idx_batch, media_batch)
            indices = torch.argsort(outputs[:, 0], dim=0)
            sentences = sentences[indices[:3].cpu().numpy()]
            summaries.append("\n".join(sentences))
    os.makedirs(args.output_dir, exist_ok=True)
    rows = zip(ids, summaries)
    with open(os.path.join(args.output_dir, "submission.csv"), "w+") as f:
        writer = csv.writer(f)
        writer.writerow(["id", "summary"])
        for row in rows:
            writer.writerow(row)
    print("done")
def get_sentimentLabel(input_text, time_info):
    try:
        print("2. predict sentiment label")
        device = torch.device("cpu")
        bertmodel, vocab = get_pytorch_kobert_model()
        tokenizer = get_tokenizer()
        tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
        model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
        weights = torch.load('weight/bert_weight.pth', map_location=torch.device('cpu'))
        model.load_state_dict(weights)
        model = model.to(device)
        model.eval()
        essay = pd.DataFrame(input_text)
        essay['label'] = 1
        save_link = "Data/{}.txt".format(time_info)
        essay.to_csv(save_link, sep='\t', index_label='idx')
        dataset_sentences = nlp.data.TSVDataset(save_link, field_indices=[1, 2], num_discard_samples=1)
        data_sentences = BERTDataset(dataset_sentences, 0, 1, tok, 100, True, False)  # max_len = 100
        sentences_dataloader = torch.utils.data.DataLoader(
            data_sentences, batch_size=len(data_sentences), num_workers=5)
        with torch.no_grad():
            for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(sentences_dataloader):
                token_ids = token_ids.long().to(device)
                segment_ids = segment_ids.long().to(device)
                label = label.long().to(device)
                outputs = model(token_ids, valid_length, segment_ids)
        pred_test = outputs
        arr = np.array(pred_test.tolist())
        arr = ne.evaluate("exp(arr)")  # element-wise exp via numexpr
        label_dic = dict([(0, 'anger'), (1, 'fear'), (2, 'happiness'), (3, 'miss'),
                          (4, 'sadness'), (5, 'surprised'), (6, 'worry')])
        for i in range(7):
            essay[label_dic[i]] = [proba[i] for proba in arr]
        essay['label'] = list(map(np.argmax, arr))
        indices = np.array(list(map(np.max, arr))).argsort()[::-1][0:min(len(essay), 10)]
        prob = essay.iloc[indices].sum(axis=0)[2:].astype(float)
        prob['happiness'] *= 0.6
        prob['fear'] *= 0.8
        prob['worry'] *= 2
        result = prob.idxmax()
        if result == 'fear':
            result = 'sadness'
        return result
    except Exception:
        raise Sentiment_Error()
def __init__(self, temp_dir, load_pretrained_bert, bert_config):
    super(Bert, self).__init__()
    bertmodel, vocab = get_pytorch_kobert_model()
    if load_pretrained_bert:
        # self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)
        self.model = bertmodel
    else:
        self.model = bertmodel
def load_model(self):
    self.bert_model, self.vocab = get_pytorch_kobert_model(ctx=self.device)
    self.model = BERTClassifier(self.bert_model, dr_rate=self.dropout_rt).to(self.device)
    self.model.load_state_dict(torch.load(self.save_path, map_location=self.device))
    self.tokenizer = get_tokenizer()
    self.token = gluonnlp.data.BERTSPTokenizer(self.tokenizer, self.vocab, lower=False)
    self.line_converter = Converter(self.token, self.max_len, self.pad, self.pair, self.device)
def __init__(self, config, num_classes, vocab=None) -> None:
    super(KobertCRF, self).__init__()
    if vocab is None:
        self.bert, self.vocab = get_pytorch_kobert_model()
    else:
        self.bert = BertModel(config=BertConfig.from_dict(bert_config))
        self.vocab = vocab
    self.dropout = nn.Dropout(config.dropout)
    self.position_wise_ff = nn.Linear(config.hidden_size, num_classes)
    self.crf = CRF(num_tags=num_classes, batch_first=True)
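# A minimal forward() sketch for KobertCRF, consistent with how the training
# loop later in this file calls it: log_likelihood, sequence_of_tags =
# model(x_input, token_type_ids, y_real). The pad-based mask construction and
# the torchcrf calls are assumptions, not the author's verified code.
def forward(self, input_ids, token_type_ids=None, tags=None):
    pad_id = self.vocab.token_to_idx[self.vocab.padding_token]
    attention_mask = input_ids.ne(pad_id).float()
    outputs = self.bert(input_ids=input_ids,
                        token_type_ids=token_type_ids,
                        attention_mask=attention_mask)
    emissions = self.position_wise_ff(self.dropout(outputs[0]))
    if tags is not None:  # training: CRF log-likelihood plus decoded tags
        log_likelihood = self.crf(emissions, tags, mask=attention_mask.byte())
        return log_likelihood, self.crf.decode(emissions)
    return self.crf.decode(emissions)  # inference: decoded tag sequences only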
def load_model(self):
    self.bert_model, self.vocab = get_pytorch_kobert_model(ctx=self.device)
    self.model = BERTClassifier(self.bert_model, dr_rate=self.dropout_rt).to(self.device)
    if self.get_weights:
        print("loading model from pretrained weights")
        self.model.load_state_dict(torch.load(self.model_save_path, map_location=self.device))
    self.tokenizer = get_tokenizer()
    self.token = gluonnlp.data.BERTSPTokenizer(self.tokenizer, self.vocab, lower=False)
def __init__(self, large, temp_dir, finetune=False):
    super(Bert, self).__init__()
    if large:
        self.model = BertModel.from_pretrained('bert-large-uncased', cache_dir=temp_dir)
    else:
        # self.model = BertModel.from_pretrained('bert-base-uncased', cache_dir=temp_dir)
        vocab = get_kobert_vocab(temp_dir)
        self.model, _ = get_pytorch_kobert_model(cachedir=temp_dir)
        # add [BOS], [EOS]
        self.model.resize_token_embeddings(len(vocab))
    self.finetune = finetune
def __init__(self, train_path, test_path, kaggle_path, use_all):
    device = torch.device("cuda:0")
    bertmodel, vocab = get_pytorch_kobert_model()
    self.model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
    dataset_train = nlp.data.TSVDataset(train_path, field_indices=[1, 2], num_discard_samples=1)
    dataset_test = nlp.data.TSVDataset(test_path, field_indices=[1, 2], num_discard_samples=1)
    dataset_kaggle = nlp.data.TSVDataset(kaggle_path, field_indices=[1], num_discard_samples=1,
                                         encoding='cp949')
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    data_train = BERTDataset(dataset_train, 0, 1, tok, config.max_len, True, False)
    data_test = BERTDataset(dataset_test, 0, 1, tok, config.max_len, True, False)
    data_kaggle = BERTDataset(dataset_kaggle, 0, 1, tok, config.max_len, True, False, kaggle=True)
    self.train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=config.batch_size,
                                                        num_workers=5)
    self.test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=config.batch_size,
                                                       num_workers=5)
    self.kaggle_dataloader = torch.utils.data.DataLoader(data_kaggle, batch_size=1, num_workers=5)
    if use_all:
        dataset_all = nlp.data.TSVDataset(config.all_path, field_indices=[1, 2], num_discard_samples=1)
        data_all = BERTDataset(dataset_all, 0, 1, tok, config.max_len, True, False)
        self.all_dataloader = torch.utils.data.DataLoader(data_all, batch_size=config.batch_size,
                                                          num_workers=5, shuffle=True)
def __init__(self, hidden_size=768, num_classes=4, dr_rate=None, params=None):
    super(BERTClassifier, self).__init__()
    self.dr_rate = dr_rate
    self.device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    self.bertmodel, _ = get_pytorch_kobert_model()
    self.classifier = nn.Linear(hidden_size, num_classes)
    if dr_rate:
        self.dropout = nn.Dropout(p=dr_rate)
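# A minimal forward() sketch matching the (token_ids, valid_length, segment_ids)
# call signature used throughout these snippets. The attention-mask construction
# follows the common KoBERT classification recipe and is an assumption, not the
# author's verified code.
def gen_attention_mask(self, token_ids, valid_length):
    attention_mask = torch.zeros_like(token_ids)
    for i, v in enumerate(valid_length):
        attention_mask[i][:v] = 1  # mark the first `v` positions as real tokens
    return attention_mask.float()

def forward(self, token_ids, valid_length, segment_ids):
    attention_mask = self.gen_attention_mask(token_ids, valid_length)
    _, pooler = self.bertmodel(input_ids=token_ids,
                               token_type_ids=segment_ids.long(),
                               attention_mask=attention_mask.to(token_ids.device))
    out = self.dropout(pooler) if self.dr_rate else pooler
    return self.classifier(out)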
def __init__(self, bert, hidden_size=768, num_classes=2, dr_rate=None, params=None):
    super(RNNClassifier, self).__init__()
    _, vocab = get_pytorch_kobert_model()
    self.dr_rate = dr_rate
    self.embedding = nn.Embedding(len(vocab.token_to_idx), 100)
    self.rnn = nn.RNN(100, hidden_size, batch_first=True)
    self.classifier = nn.Linear(hidden_size, num_classes)
    if dr_rate:
        self.dropout = nn.Dropout(p=dr_rate)
def __init__(self, tokenizer_s='spacy'):
    """
    bert-multi, kbalbert : [PAD], [CLS], ...

    :param tokenizer_s: string naming a tokenizer ('spacy', 'bert-multi', ...),
                        or a pre-built tokenizer object

    Example::

        nlp = English()
        tokenizer = nlp.Defaults.create_tokenizer(nlp)
        tokenizer = Tokenizer(tokenizer)
    """
    if type(tokenizer_s) is str:
        self.tokenizer_s = tokenizer_s
        if tokenizer_s == 'spacy':
            self.nlp = spacy.load("en_core_web_md")  # md and lg models ship embedding vectors
            self.tokenizer = self.nlp.Defaults.create_tokenizer(self.nlp)
        elif tokenizer_s == 'bert-multi':
            from transformers import BertTokenizer, BertModel, BertConfig
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
            self.vocab = self.tokenizer.vocab
        elif tokenizer_s == 'sktkobert':
            import gluonnlp as nlp
            from kobert.utils import get_tokenizer
            from kobert.pytorch_kobert import get_pytorch_kobert_model
            kobert, vocab = get_pytorch_kobert_model()
            self.tokenizer = nlp.data.BERTSPTokenizer(get_tokenizer(), vocab, lower=False)
            self.vocab = vocab
        elif tokenizer_s == 'kbalbert':
            import sys
            sys.path.append('/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/')
            from transformers import AlbertModel, TFAlbertModel
            from tokenization_kbalbert import KbAlbertCharTokenizer
            model_path = '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/model'
            self.tokenizer = KbAlbertCharTokenizer.from_pretrained(model_path)
            self.vocab = self.tokenizer.vocab
        else:
            # any other string is treated as a Hugging Face model name
            from transformers import BertTokenizer, BertModel, BertConfig
            self.tokenizer = BertTokenizer.from_pretrained(tokenizer_s)
            self.vocab = self.tokenizer.vocab
    elif type(tokenizer_s) is not str:
        # a pre-built tokenizer object was passed in
        self.tokenizer = tokenizer_s
        self.tokenizer_s = 'custom'
    else:
        raise Exception('check tokenizer is correctly defined')
    self.pre_trained = self.tokenizer_s
def bert_test(opt):
    device = torch.device('cuda:{}'.format(opt.device))
    model = torch.load(opt.weights)
    model.to(device)
    # model = nn.DataParallel(model, output_device=[0, 1])
    bertmodel, vocab = get_pytorch_kobert_model()
    model.eval()  # switch to evaluation mode

    def calc_accuracy(X, Y):
        max_vals, max_indices = torch.max(X, 1)
        train_acc = (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]
        return train_acc

    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    max_len = 256  # tokens beyond this length are truncated and never seen by BERT
    batch_size = opt.batch
    dataset_test = nlp.data.TSVDataset(opt.source, field_indices=[1, 2], num_discard_samples=1)
    data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)
    test_acc = 0.0
    df = pd.DataFrame(columns=['pred', 'label'])
    pred = np.array([])
    # answer = np.array([])
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        # label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        _, max_idx = torch.max(out, 1)
        pred = np.append(pred, max_idx.cpu().detach().tolist())
        # answer = np.append(answer, label.cpu().detach().tolist())
        # test_acc += calc_accuracy(out, label)
    df['pred'] = pred
    # df['label'] = answer
    df.to_csv(opt.save_csv_name, index=False)
def __init__(self, vocab=None, tokenizer=None, maxlen=30, model_dir=Path('data_in')):
    if vocab is None or tokenizer is None:
        tok_path = get_tokenizer()
        self.ptr_tokenizer = SentencepieceTokenizer(tok_path)
        self.ptr_detokenizer = SentencepieceDetokenizer(tok_path)
        _, vocab_of_gluonnlp = get_pytorch_kobert_model()
        token2idx = vocab_of_gluonnlp.token_to_idx
        self.vocab = Vocabulary(token2idx=token2idx)
        self.tokenizer = Tokenizer(vocab=self.vocab, split_fn=self.ptr_tokenizer,
                                   pad_fn=keras_pad_fn, maxlen=maxlen)
    else:
        self.vocab = vocab
        self.tokenizer = tokenizer
    self.maxlen = maxlen
    self.model_dir = model_dir
def __init__(self, config, num_classes, vocab=None) -> None:
    super(KobertBiLSTMCRF, self).__init__()
    if vocab is None:  # use the pretrained model
        self.bert, self.vocab = get_pytorch_kobert_model()
    else:  # use a fine-tuned model
        self.bert = BertModel(config=BertConfig.from_dict(bert_config))
        self.vocab = vocab
    self._pad_id = self.vocab.token_to_idx[self.vocab.padding_token]
    self.dropout = nn.Dropout(config.dropout)
    self.bilstm = nn.LSTM(config.hidden_size, config.hidden_size // 2,
                          dropout=config.dropout, batch_first=True, bidirectional=True)
    self.position_wise_ff = nn.Linear(config.hidden_size, num_classes)
    self.crf = CRF(num_tags=num_classes, batch_first=True)
def BERT_inference(text):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    _, vocab = get_pytorch_kobert_model(device)
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    max_len = 80
    # Option 1: load only the trained parameters
    # new_save_path = 'v3_model_only_parameter_0302.pt'
    # model = BERTClassifier(bertmodel, dr_rate=0.1)
    # model.load_state_dict(torch.load(new_save_path))
    # model.eval()
    # Option 2: load the whole saved model
    save_path = 'v2_model_0302.pt'
    model = torch.load(save_path)
    model.eval()
    infer_data = BERTDataset_infer(text, 0, tok, max_len, True, False)
    infer_data = torch.tensor(next(iter(infer_data))[0]).reshape(1, -1)
    segments_tensors = torch.zeros(len(infer_data[0])).reshape(1, -1)
    valid_length = torch.tensor(len(infer_data[0])).reshape(1, -1)
    infer_data = infer_data.long().to(device)
    segments_tensors = segments_tensors.long().to(device)
    valid_length = valid_length.long().to(device)
    with torch.no_grad():
        outputs = model(infer_data, valid_length, segments_tensors)
    print("final deep-learning inference:", torch.argmax(outputs[0]))
    return torch.argmax(outputs[0])
def __init__(self, vectorizer=None, tokenizer=None, dim_embed=200):
    """
    :param tokenizer: KB
    """
    self.vectorizer = vectorizer
    self.tokenizer = tokenizer
    self.pre_trained = pre_trained = vectorizer.pre_trained
    self.n_tag = self.vectorizer.n_tag
    if 'bert' in pre_trained.lower():
        self.tag2vec = None
        import sys
        if pre_trained == 'bert-multi':
            from transformers import BertModel, BertConfig
            bert_config = BertConfig.from_pretrained('bert-base-multilingual-cased',
                                                     output_hidden_states=True)
            self.bert = BertModel(bert_config).to(device)
        elif pre_trained == 'sktkobert':
            from kobert.pytorch_kobert import get_pytorch_kobert_model
            # sys.path.append('/home/bwlee/work/codes/sentence_similarity/kobert')
            # from pytorch_kobert3 import get_pytorch_kobert_model
            self.bert, _ = get_pytorch_kobert_model()
            self.bert = self.bert.to(device)
        elif pre_trained == 'kbalbert':
            sys.path.append('/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/')
            from transformers import AlbertModel
            kbalbert_path = '/home/bwlee/work/codes/KB-ALBERT-KO/kb-albert-char/model'
            self.bert = AlbertModel.from_pretrained(kbalbert_path, output_hidden_states=True)
            self.bert = self.bert.to(device)
        else:
            from transformers import BertModel, BertConfig
            bert_config = BertConfig.from_pretrained(pre_trained, output_hidden_states=True)
            self.bert = BertModel(bert_config).to(device)
    else:
        self.tag2vec = self.vectorizer.tag2vec
        self.n_vocab = len(self.vectorizer.tag2vec)
        if pre_trained == '':
            self.embed = nn.Embedding(num_embeddings=self.n_tag,
                                      embedding_dim=dim_embed,
                                      padding_idx=self.tag2ix[PAD_TAG])
def make_model(N=6, d_model=768, d_ff=1024, h=8, dropout=0.1):
    """Helper: construct a model from hyperparameters."""
    # To copy the BERT embedding layer, d_model must be the same for the Generator.
    # Since d_model=768 in the decoder is too big to train, d_ff is reduced
    # from 3072 to 1024. (zzingae)
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    bert, vocab = get_pytorch_kobert_model()
    vocab_size = len(vocab)
    model = Chatbot(
        bert,
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, vocab_size), c(position)),
        nn.Sequential(Embeddings(d_model, vocab_size), c(position)),
        Generator(d_model, vocab_size))
    return model, vocab
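# Hypothetical usage of make_model(). Chatbot, Decoder, MultiHeadedAttention,
# etc. are assumed to follow the annotated-Transformer layout this snippet
# references; only the KoBERT encoder and vocabulary come from get_pytorch_kobert_model().
model, vocab = make_model(N=6, d_model=768, d_ff=1024, h=8, dropout=0.1)
print("decoder shares KoBERT's 768-dim embedding space; vocab size:", len(vocab))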
def get_loader(raw_data, max_len, batch_size=100, shuffle=False, user_map_dict=None, max_users=10):
    def collate_fn(data):
        return zip(*data)

    bertmodel, vocab = get_pytorch_kobert_model()
    tokenizer = kobert.utils.get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    dataset = CDMMBDataset(raw_data[0], raw_data[1], raw_data[2], tok, max_len, True, False,
                           user_map_dict, max_users)
    data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle,
                             collate_fn=collate_fn)
    return data_loader
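# Hypothetical call to get_loader(); raw_data's three fields and the exact
# batch tuple layout depend on CDMMBDataset, which is defined elsewhere.
loader = get_loader(raw_data, max_len=64, batch_size=32, shuffle=True)
for batch in loader:
    ...  # collate_fn yields the dataset's per-item fields zipped across the batch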
def __init__(self, path: str, max_seqlen: int = 512, ignore_index=-100) -> None:
    super(TokenizedDataset, self).__init__()
    with open(path, "rb") as f:
        self.data = pickle.load(f)
    self.max_len = max_seqlen
    _, self.vocab = get_pytorch_kobert_model()
    tok = get_tokenizer()
    self.tokenizer = nlp.data.BERTSPTokenizer(tok, self.vocab, lower=False)
    if "train" in path:
        self.data["token"] = self.data["token"][:100000]
        self.data["tgt"] = self.data["tgt"][:100000]
    self.tokens = self.data["token"]
    self.labels = self.data["tgt"]
    self.cls_idx = self.vocab["[CLS]"]
    self.pad_idx = self.vocab["[PAD]"]
    self.sep_idx = self.vocab["[SEP]"]
    self.mask_idx = self.vocab["[MASK]"]
    self.ignore_idx = ignore_index
def __init__(
    self,
    train_path: str = None,
    val_path: str = None,
    test_path: str = None,
    lr: float = None,
    warmup_percent: float = 0.1,
    train_batch_size: int = None,
    val_batch_size: int = None,
    num_classes: int = 2,
    num_workers: int = 2,
    gpus: int = 2,
    config: dict = bert_config,
) -> None:
    super(ContentSelector, self).__init__()
    self.save_hyperparameters()
    self.lr = self.hparams.lr
    self.lr_scale = 0
    self.loss = nn.CrossEntropyLoss()
    self.bert, self.vocab = get_pytorch_kobert_model()
    self.dropout = nn.Dropout(self.hparams.config["hidden_dropout_prob"])
    self.classifier = nn.Linear(self.hparams.config["hidden_size"], num_classes)
def main(args):
    root = args.path
    mode = args.mode
    dset = load_data(root, mode)
    _, vocab = get_pytorch_kobert_model()
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    tokenized_dset = []
    start_time = time.time()
    for d in tqdm(dset):
        tokenized_dset.append(tokenize(d, tok))
    print("--- %s seconds for tokenizing ---" % (time.time() - start_time))
    start_time = time.time()
    result = {"token": [], "tgt": []}
    for idx, data in tqdm(enumerate(tokenized_dset)):
        src = " ".join([" ".join(d) for d in data["tokenized_src"]]).split(" ")
        tgt = " ".join([" ".join(d) for d in data["tokenized_abs"]]).split(" ")
        auxiliary_tgt = make_aux_tgt(src, tgt)
        assert len(src) == len(auxiliary_tgt), \
            f"Length mismatch: {len(src)}, {len(auxiliary_tgt)}"
        result["token"].append(src)
        result["tgt"].append(auxiliary_tgt)
    print("--- %s seconds for generating labels ---" % (time.time() - start_time))
    with open(f"{args.save_path}/contentselection_{mode}.pickle", "wb") as f:
        pickle.dump(result, f)
    print("--- Finished ---")
def main(parser):
    # Config
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    # data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')

    # Vocab & Tokenizer
    tok_path = get_tokenizer()  # ./tokenizer_78b3253a26.model
    ptr_tokenizer = SentencepieceTokenizer(tok_path)
    _, vocab_of_gluonnlp = get_pytorch_kobert_model()
    token_to_idx = vocab_of_gluonnlp.token_to_idx
    model_config.vocab_size = len(token_to_idx)
    vocab = Vocabulary(token_to_idx=token_to_idx)
    print("len(token_to_idx): ", len(token_to_idx))
    with open(model_dir / "token2idx_vocab.json", 'w', encoding='utf-8') as f:
        json.dump(token_to_idx, f, ensure_ascii=False, indent=4)

    # save vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'wb') as f:
        pickle.dump(vocab, f)
    # load vocab & tokenizer
    with open(model_dir / "vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)
    tokenizer = Tokenizer(vocab=vocab, split_fn=ptr_tokenizer, pad_fn=keras_pad_fn,
                          maxlen=model_config.maxlen)
    ner_formatter = NamedEntityRecognitionFormatter(vocab=vocab, tokenizer=tokenizer,
                                                    maxlen=model_config.maxlen, model_dir=model_dir)

    # Train & Val Datasets
    cwd = Path.cwd()
    data_in = cwd / "data_in"
    train_data_dir = data_in / "NER-master" / "말뭉치 - 형태소_개체명"
    tr_clf_ds = NamedEntityRecognitionDataset(train_data_dir=train_data_dir, model_dir=model_dir)
    tr_clf_ds.set_transform_fn(transform_source_fn=ner_formatter.transform_source_fn,
                               transform_target_fn=ner_formatter.transform_target_fn)
    tr_clf_dl = DataLoader(tr_clf_ds, batch_size=model_config.batch_size, shuffle=True,
                           num_workers=4, drop_last=False)

    # Model
    model = KobertCRF(config=model_config, num_classes=len(tr_clf_ds.ner_to_index))
    model.train()

    # optim
    train_examples_len = len(tr_clf_ds)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]
    # num_train_optimization_steps = int(train_examples_len / model_config.batch_size / model_config.gradient_accumulation_steps) * model_config.epochs
    t_total = len(tr_clf_dl) // model_config.gradient_accumulation_steps * model_config.epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=model_config.learning_rate,
                      eps=model_config.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=model_config.warmup_steps,
                                     t_total=t_total)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    n_gpu = torch.cuda.device_count()
    # if n_gpu > 1:
    #     model = torch.nn.DataParallel(model)
    model.to(device)

    # save
    tb_writer = SummaryWriter('{}/runs'.format(model_dir))
    checkpoint_manager = CheckpointManager(model_dir)
    summary_manager = SummaryManager(model_dir)
    best_val_loss = 1e+10
    best_train_acc = 0

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(tr_clf_ds))
    logger.info("  Num Epochs = %d", model_config.epochs)
    logger.info("  Instantaneous batch size per GPU = %d", model_config.batch_size)
    # logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
    #             args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", model_config.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_dev_acc, best_dev_loss = 0.0, 99999999999.0
    best_steps = 0
    model.zero_grad()
    set_seed()  # added here for reproducibility (even between python 2 and 3)

    # Train
    train_iterator = trange(int(model_config.epochs), desc="Epoch")
    for _epoch, _ in enumerate(train_iterator):
        epoch_iterator = tqdm(tr_clf_dl, desc="Iteration")  # disable=args.local_rank not in [-1, 0]
        epoch = _epoch
        for step, batch in enumerate(epoch_iterator):
            model.train()
            x_input, token_type_ids, y_real = map(lambda elm: elm.to(device), batch)
            log_likelihood, sequence_of_tags = model(x_input, token_type_ids, y_real)

            # loss: negative log-likelihood
            loss = -1 * log_likelihood
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if model_config.gradient_accumulation_steps > 1:
                loss = loss / model_config.gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), model_config.max_grad_norm)
            tr_loss += loss.item()

            if (step + 1) % model_config.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # update learning rate schedule
                model.zero_grad()
                global_step += 1

                with torch.no_grad():
                    sequence_of_tags = torch.tensor(sequence_of_tags)
                    print("sequence_of_tags: ", sequence_of_tags)
                    print("y_real: ", y_real)
                    print("loss: ", loss)
                    print("(sequence_of_tags == y_real): ", (sequence_of_tags == y_real))
                    mb_acc = (sequence_of_tags == y_real).float()[y_real != vocab.PAD_ID].mean()

                tr_acc = mb_acc.item()
                tr_loss_avg = tr_loss / global_step
                tr_summary = {'loss': tr_loss_avg, 'acc': tr_acc}

                # if step % 50 == 0:
                print('epoch : {}, global_step : {}, tr_loss: {:.3f}, tr_acc: {:.2%}'.format(
                    epoch + 1, global_step, tr_summary['loss'], tr_summary['acc']))

                if model_config.logging_steps > 0 and global_step % model_config.logging_steps == 0:
                    # Log metrics
                    if model_config.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        pass
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / model_config.logging_steps,
                                         global_step)
                    logger.info("Average loss: %s at global step: %s",
                                str((tr_loss - logging_loss) / model_config.logging_steps),
                                str(global_step))
                    logging_loss = tr_loss

                if model_config.save_steps > 0 and global_step % model_config.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(model_config.output_dir, 'epoch-{}'.format(epoch + 1))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    logger.info("Saving model checkpoint to %s", output_dir)
                    state = {'global_step': global_step + 1,
                             'model_state_dict': model.state_dict(),
                             'opt_state_dict': optimizer.state_dict()}
                    summary = {'train': tr_summary}
                    summary_manager.update(summary)
                    summary_manager.save('summary.json')
                    # selected by acc (should really use val_acc rather than train_acc)
                    is_best = tr_acc >= best_train_acc
                    # Save
                    if is_best:
                        best_train_acc = tr_acc
                        checkpoint_manager.save_checkpoint(
                            state, 'best-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1, global_step, tr_acc))
                    else:
                        torch.save(state, os.path.join(
                            output_dir, 'model-epoch-{}-step-{}-acc-{:.3f}.bin'.format(epoch + 1, global_step, tr_acc)))

    tb_writer.close()
    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss / global_step)
    return global_step, tr_loss / global_step, best_steps
import torch  # needed below; `import torch.optim as optim` binds only `optim`
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

## when using a GPU
device = torch.device("cuda:0")
bertmodel, vocab = get_pytorch_kobert_model()

from google.colab import drive
drive.mount('/content/drive')

# load the training dataset
import pandas as pd
dataset_train1 = pd.read_csv('/content/drive/My Drive/Colab Notebooks/자연어처리/user_conversation.csv')
dataset_train1.head()

# data preprocessing: drop index and metadata columns
dataset_train1.drop(['Unnamed: 0', 'Unnamed: 0.1', '질문 제목', '작성 시간', '태그', 'url'],
                    axis=1, inplace=True)
dataset_train1.head()
dataset_train1['질병명'].unique()
def main(argv):
    if FLAGS.model == 'BERT':
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        model = BertClassificationModel(input_path=FLAGS.input_path,
                                        model='bert-base-multilingual-cased',
                                        tokenizer=tokenizer,
                                        batch_size=FLAGS.batch_size,
                                        num_workers=FLAGS.num_workers,
                                        lr=FLAGS.lr,
                                        weight_decay=FLAGS.weight_decay,
                                        warm_up=FLAGS.warm_up)
    elif FLAGS.model == 'KoBERT':
        bertmodel, vocab = get_pytorch_kobert_model()
        tokenizer = nlp.data.BERTSPTokenizer(get_tokenizer(), vocab, lower=False)
        model = KoBertClassficationModel(input_path=FLAGS.input_path,
                                         model=bertmodel,
                                         tokenizer=tokenizer,
                                         batch_size=FLAGS.batch_size,
                                         num_workers=FLAGS.num_workers,
                                         lr=FLAGS.lr,
                                         weight_decay=FLAGS.weight_decay,
                                         warm_up=FLAGS.warm_up)
    elif FLAGS.model == 'KcBERT':
        tokenizer = BertTokenizer.from_pretrained('beomi/kcbert-large')
        model = BertClassificationModel(input_path=FLAGS.input_path,
                                        model='beomi/kcbert-large',
                                        tokenizer=tokenizer,
                                        batch_size=FLAGS.batch_size,
                                        num_workers=FLAGS.num_workers,
                                        lr=FLAGS.lr,
                                        weight_decay=FLAGS.weight_decay,
                                        warm_up=FLAGS.warm_up)
    else:
        raise ValueError('Unknown model type')

    seed_everything(42)
    checkpoint_callback = ModelCheckpoint(filepath=FLAGS.save_dir, save_top_k=1,
                                          monitor='val_loss', mode='min')
    early_stop = EarlyStopping(monitor='val_loss', patience=2, strict=False,
                               verbose=False, mode='min')
    logger = TensorBoardLogger(save_dir=FLAGS.save_dir, name='logs_' + FLAGS.model,
                               version=FLAGS.version)
    lr_monitor = LearningRateMonitor(logging_interval='step')

    if FLAGS.config_path is not None:
        parser = ConfigParser()
        parser.read(FLAGS.config_path)

        @telegram_sender(token=parser.get('telegram', 'token'),
                         chat_id=parser.get('telegram', 'chat_id'))
        def train_notify(trainer: Trainer = None,
                         model: Union[BertClassificationModel, KoBertClassficationModel] = None) -> None:
            trainer.fit(model)

    if FLAGS.cuda_device > 1:
        trainer = Trainer(deterministic=True, gpus=FLAGS.cuda_device, distributed_backend='ddp',
                          log_gpu_memory=True, checkpoint_callback=checkpoint_callback,
                          early_stop_callback=early_stop, max_epochs=FLAGS.max_epochs,
                          logger=logger, callbacks=[lr_monitor])
        logging.info(f'There are {torch.cuda.device_count()} GPU(s) available.')
        logging.info(f'Use the number of GPU: {FLAGS.cuda_device}')
    elif FLAGS.cuda_device == 1:
        trainer = Trainer(deterministic=True, gpus=FLAGS.cuda_device, log_gpu_memory=True,
                          checkpoint_callback=checkpoint_callback, early_stop_callback=early_stop,
                          max_epochs=FLAGS.max_epochs, logger=logger, callbacks=[lr_monitor])
        logging.info(f'There are {torch.cuda.device_count()} GPU(s) available.')
        logging.info(f'Use the number of GPU: {FLAGS.cuda_device}')
    else:
        trainer = Trainer(deterministic=True, checkpoint_callback=checkpoint_callback,
                          early_stop_callback=early_stop, max_epochs=FLAGS.max_epochs,
                          logger=logger, callbacks=[lr_monitor])
        logging.info('No GPU available, using the CPU instead.')

    if FLAGS.config_path is not None:
        train_notify(trainer=trainer, model=model)
    else:
        trainer.fit(model)
def run_model():
    device = torch.device("cuda:0")
    bertmodel, vocab = get_pytorch_kobert_model()
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
    max_len = 64
    batch_size = 64
    warmup_ratio = 0.1
    num_epochs = 2
    max_grad_norm = 1
    log_interval = 200
    learning_rate = 5e-5
    dataset_train, dataset_test = train_test_split(dtls, test_size=0.2, random_state=123)
    data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
    data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)
    train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=0)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)
    model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

    # set up the optimizer and loss function
    # (simple variant without weight-decay parameter grouping; the snippet
    # referenced an undefined `optimizer_grouped_parameters`)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()
    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step,
                                                num_training_steps=t_total)

    # training loop
    for e in range(num_epochs):
        train_acc = 0.0
        test_acc = 0.0
        model.train()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # update learning rate schedule
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                print("epoch {} batch id {} loss {} train acc {}".format(
                    e + 1, batch_id + 1, loss.data.cpu().numpy(), train_acc / (batch_id + 1)))
        print("epoch {} train acc {}".format(e + 1, train_acc / (batch_id + 1)))

        # evaluation
        model.eval()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
        print("epoch {} test acc {}".format(e + 1, test_acc / (batch_id + 1)))
    return model
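# calc_accuracy() is used by run_model() above but not defined in this snippet;
# a minimal version consistent with the one inside bert_test() earlier:
def calc_accuracy(X, Y):
    _, max_indices = torch.max(X, 1)  # predicted class per row of logits
    return (max_indices == Y).sum().data.cpu().numpy() / max_indices.size()[0]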