class Predictor(object):
    def __init__(self, args):
        pretrain_name = 'bert-base-cased'
        if args.model_info.bert_path:
            pretrain_name = args.model_info.bert_path
        self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
        print(f"Tokenizer from: {pretrain_name}")
        train_conf = args.train_info
        model_conf = args.model_info
        self.device = train_conf.device
        self.class_num = model_conf.class_num
        self.model = BertClassifier(model_conf)
        self.model.load_state_dict(
            torch.load(train_conf.model_path,
                       map_location=torch.device(self.device)))
        self.model.to(self.device)
        self.lr = train_conf.lr
        self.max_len = train_conf.max_seq_len
        self.conf = args
        self.label_map = json.load(open(args.label_map_path))
        self.id2label = {i: label_str for label_str, i in self.label_map.items()}
        self.softmax = Softmax(dim=1)

    def predict(self, sens):
        d_loader = self.sen_2_dl(sens)
        y_pred = list()
        with torch.no_grad():
            for batch in d_loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                logits = self.model(input_ids, attention_mask)
                logits = torch.sigmoid(logits)
                y_pred.append(logits)
        y_pred = torch.cat(y_pred)
        y_pred = y_pred.cpu().numpy()
        res = list()
        for y in y_pred:
            res.append(self._score_2_dict(y))
        return res

    def _score_2_dict(self, single_pred):
        res = dict()
        for i, score in enumerate(single_pred):
            label_str = self.id2label[i]
            res[label_str] = float(score)
        return res

    def sen_2_dl(self, sens):
        texts = [i.strip() for i in sens]
        # 999 is a placeholder label: the dataset requires labels, but they are unused at inference time
        labels = [999]
        ds = SentimentDataset(self.tokenizer, texts, labels, self.max_len)
        _loader = dataloader.DataLoader(ds,
                                        batch_size=self.conf.train_info.batch_size,
                                        shuffle=False)
        return _loader
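# A minimal usage sketch for the Predictor above, assuming an `args` config object with the
# nested fields it reads (model_info, train_info, label_map_path); `load_config` is a
# hypothetical helper standing in for however the project builds that object.
args = load_config('conf/predict.yaml')
predictor = Predictor(args)
scores = predictor.predict(["the service was friendly and fast",
                            "the food was cold and overpriced"])
# `scores` holds one dict per sentence mapping each label string to a sigmoid score,
# e.g. [{'positive': 0.93, 'negative': 0.04, ...}, {'positive': 0.08, 'negative': 0.91, ...}]
print(scores)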
def create_model(self, num_train_step, num_warmup_step):
    """
    Select the model specified in the config file and initialize it.
    :return:
    """
    model = BertClassifier(config=self.config,
                           num_train_step=num_train_step,
                           num_warmup_step=num_warmup_step)
    return model
class Trainer(object):
    def __init__(self, config):
        self.config = config
        self.data_processor = DataProcessor("/Users/a5560648/workspace/tutor/data",
                                            max_len=config["max_len"])
        self.model = BertClassifier(config=config)

    def train(self):
        data_loader = DataLoader(self.data_processor.get_dataset(),
                                 batch_size=self.config["batch_size"],
                                 shuffle=True,
                                 drop_last=True)
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.config["lr"])
        loss_fn = torch.nn.functional.cross_entropy
        for epoch in range(self.config["epoch"]):
            with tqdm(total=len(data_loader)) as pbar:
                for input_ids, token_type_ids, attention_mask, labels in data_loader:
                    optimizer.zero_grad()
                    output = self.model(input_ids, token_type_ids, attention_mask)
                    loss = loss_fn(output, labels)
                    loss.backward()
                    optimizer.step()
                    pbar.update(1)
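# The Trainer above expects a BertClassifier whose forward takes
# (input_ids, token_type_ids, attention_mask) and returns class logits. That class is not
# shown in this section; the following is only a minimal sketch of a compatible module,
# assuming a Hugging Face BertModel backbone and a config dict with "bert_model_path" and
# "num_labels" keys (hypothetical names, not the project's actual config).
import torch
import torch.nn as nn
from transformers import BertModel


class BertClassifierSketch(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.bert = BertModel.from_pretrained(config["bert_model_path"])
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, config["num_labels"])

    def forward(self, input_ids, token_type_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids,
                            token_type_ids=token_type_ids,
                            attention_mask=attention_mask)
        # classify from the pooled [CLS] representation
        pooled = self.dropout(outputs.pooler_output)
        return self.fc(pooled)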
class NLU:
    def __init__(self, config, RecognitionResultManager):
        self.config = config
        self.genre_keywords_db = self.get_db(self.config['DB']['host'],
                                             self.config['DB']['db_name'],
                                             self.config['DB']['user'])
        self.rrm = RecognitionResultManager
        self.model_path = "/Users/jinsakuma/Downloads/model_gpu_v4.3.3.pth"
        self.model_config = BertConfig.from_pretrained(
            'cl-tohoku/bert-base-japanese-whole-word-masking', output_attentions=True)
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/bert-base-japanese-whole-word-masking')
        self.bert_model = BertModel.from_pretrained(
            'cl-tohoku/bert-base-japanese-whole-word-masking', config=self.model_config)
        self.model = BertClassifier(self.bert_model)
        self.max_len = 30
        self.load_weights(self.model_path)
        self.device = torch.device("cpu")
        self.order_list = ['recommendation', 'title', 'abstract', 'review', 'evaluation',
                           'actor', 'genre', 'director', None]

    def get_db(self, host="localhost", db_name="woz_system", user="******"):
        '''
        Fetch all utterance keywords from MySQL in one go.
        :return: db (dict)
        '''
        connector = MySQLdb.connect(host=host, db=db_name, user=user,
                                    passwd="", charset="utf8")
        cursor = connector.cursor()  # create a cursor

        # keywords used for movie recommendation
        cursor.execute('select * from genre')
        genres = cursor.fetchall()
        genre_keywords_db = {}
        for genre in genres:
            genre_id = genre[1]
            genre_type = genre[2]  # .encode('utf-8')
            genre_keywords_db[genre_type] = []
            cursor.execute(
                'select keywords from genre_keywords where genre_id={}'.format(genre_id))
            keywords = cursor.fetchall()
            keyword_list = keywords[0][0].split(",")
            genre_keywords_db[genre_type] = keyword_list
        return genre_keywords_db

    def load_weights(self, model_path):
        load_weights = torch.load(model_path, map_location={'cuda:0': 'cpu'})
        self.model.load_state_dict(load_weights)

    def bert_tokenizer(self, input_text):
        return self.tokenizer.encode(input_text,
                                     max_length=self.max_len,
                                     truncation=True,
                                     return_tensors='pt')[0]

    def get_order(self, input_text):
        token = self.bert_tokenizer(input_text)
        token = token.unsqueeze(0)
        output, attentions = self.model(token.to(self.device))
        _, pred = torch.max(output, 1)
        print("NLU result: ", self.order_list[pred.item()])
        return self.order_list[pred.item()]

    def get_text(self, N):
        df = self.rrm.get_df()
        text_list = df['transcript'].iloc[-N:].tolist()
        target_list = df['speaker'].iloc[-N:].tolist()
        return text_list, target_list

    def check_genre(self, input_texts):
        # keyword matching
        for text in reversed(input_texts):
            for response_type, keywords in self.genre_keywords_db.items():
                for keyword in keywords:
                    if keyword in text:
                        return response_type
        return None
def main(args, f):
    set_seed(args.train_seed)

    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    src_eval_loader, src_loader, tgt_all_loader, tgt_train_loader, tgt_te = get_all_dataloader(
        args, tokenizer)

    # load models
    if args.model == 'bert':
        src_encoder = BertEncoder()
        # encoder = BertEncoder()
        classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        # encoder = DistilBertEncoder()
        classifier = BertClassifier()
    elif args.model == 'roberta':
        src_encoder = RobertaEncoder()
        # encoder = RobertaEncoder()
        classifier = RobertaClassifier()
    else:
        src_encoder = DistilRobertaEncoder()
        # encoder = DistilRobertaEncoder()
        classifier = RobertaClassifier()
    discriminator = Discriminator()

    # parallel models
    if torch.cuda.device_count() > 1:
        print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
        src_encoder = nn.DataParallel(src_encoder)
        classifier = nn.DataParallel(classifier)
        # encoder = nn.DataParallel(encoder)
        discriminator = nn.DataParallel(discriminator)

    if args.load:
        src_encoder = init_model(args, src_encoder, restore_path=param.src_encoder_path)
        classifier = init_model(args, classifier, restore_path=param.src_classifier_path)
        # encoder = init_model(args, encoder, restore_path=param.tgt_encoder_path)
        # discriminator = init_model(args, discriminator, restore_path=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        classifier = init_model(args, classifier)
        # encoder = init_model(args, encoder)
        discriminator = init_model(args, discriminator)

    # train source model
    if args.pretrain:
        print("=== Training classifier for source domain ===")
        src_encoder, classifier = pretrain(args, src_encoder, classifier, src_loader)

        # save pretrained model
        # save_model(args, src_encoder, param.src_encoder_path)
        # save_model(args, classifier, param.src_classifier_path)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(args, src_encoder, classifier, src_loader)
    src_acc = evaluate(args, src_encoder, classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: No adapt acc on src data: {src_acc}\n')

    # adapt
    print("=== Adapt tgt encoder ===")
    # encoder.load_state_dict(src_encoder.state_dict())
    # if args.src_free:
    #     s_res_features = src_gmm(args, src_encoder, src_loader)
    #     src_loader = s_numpy_dataloader(s_res_features, args.batch_size)
    #     encoder = aad_adapt_src_free(args, src_encoder, encoder, discriminator,
    #                                  classifier, src_loader, tgt_train_loader, tgt_all_loader)
    # else:
    if args.adapt:
        encoder, classifier = shot_adapt(args, src_encoder, classifier,
                                         tgt_train_loader, tgt_all_loader, tgt_te)

    # save_model(args, encoder, param.tgt_encoder_path)

    # argument setting
    # print("=== Argument Setting ===")
    print(
        f"model_type: {args.model}; max_seq_len: {args.max_seq_length}; batch_size: {args.batch_size}; "
        f"pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; src: {args.src}; tgt: {args.tgt}; "
        f'src_free: {args.src_free}; dp: {args.dp}')

    # eval target encoder on lambda0.1 set of target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> domain adaption <<<")
    tgt_acc = evaluate(args, encoder, classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: DA acc on tgt data: {tgt_acc}\n')
    f.write(
        f"model_type: {args.model}; batch_size: {args.batch_size}; pre_epochs: {args.pre_epochs}; "
        f"num_epochs: {args.num_epochs}; src_free: {args.src_free}; src: {args.src}; "
        f"tgt: {args.tgt}; dp: {args.dp}\n\n")
def main():
    args = parse_arguments()
    # argument setting
    print("=== Argument Setting ===")
    print("src: " + args.src)
    print("tgt: " + args.tgt)
    print("alpha: " + str(args.alpha))
    print("seed: " + str(args.seed))
    print("train_seed: " + str(args.train_seed))
    print("model_type: " + str(args.model))
    print("max_seq_length: " + str(args.max_seq_length))
    print("batch_size: " + str(args.batch_size))
    print("num_epochs: " + str(args.num_epochs))
    set_seed(args.train_seed)

    if args.model == 'roberta':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    print("=== Processing datasets ===")
    if args.src == 'blog':
        src_x, src_y = CSV2Array(os.path.join('data', args.src, 'blog.csv'))
    elif args.src == 'airline':
        src_x, src_y = CSV2Array(os.path.join('data', args.src, 'airline.csv'))
    else:
        src_x, src_y = XML2Array(
            os.path.join('data', args.src, 'negative.review'),
            os.path.join('data', args.src, 'positive.review'))

    src_x, src_test_x, src_y, src_test_y = train_test_split(
        src_x, src_y, test_size=0.2, stratify=src_y, random_state=args.seed)

    if args.tgt == 'blog':
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'blog.csv'))
    elif args.tgt == 'airline':
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, 'airline.csv'))
    else:
        tgt_x, tgt_y = XML2Array(
            os.path.join('data', args.tgt, 'negative.review'),
            os.path.join('data', args.tgt, 'positive.review'))

    tgt_train_x, _, tgt_train_y, _ = train_test_split(
        tgt_x, tgt_y, test_size=0.2, stratify=tgt_y, random_state=args.seed)

    if args.model == 'roberta':
        src_features = roberta_convert_examples_to_features(
            src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = roberta_convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = roberta_convert_examples_to_features(
            tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)
        tgt_all_features = roberta_convert_examples_to_features(
            tgt_x, tgt_y, args.max_seq_length, tokenizer)
    else:
        src_features = convert_examples_to_features(
            src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = convert_examples_to_features(
            tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)
        tgt_all_features = convert_examples_to_features(
            tgt_x, tgt_y, args.max_seq_length, tokenizer)

    # load dataset
    src_data_loader = get_data_loader(src_features, args.batch_size)
    src_data_loader_eval = get_data_loader(src_test_features, args.batch_size)
    tgt_data_loader = get_data_loader(tgt_features, args.batch_size)
    tgt_data_loader_all = get_data_loader(tgt_all_features, args.batch_size)

    # load models
    if args.model == 'bert':
        encoder = BertEncoder()
        cls_classifier = BertClassifier()
        dom_classifier = DomainClassifier()
    elif args.model == 'distilbert':
        encoder = DistilBertEncoder()
        cls_classifier = BertClassifier()
        dom_classifier = DomainClassifier()
    else:
        encoder = RobertaEncoder()
        cls_classifier = RobertaClassifier()
        dom_classifier = RobertaDomainClassifier()

    if args.load:
        encoder = init_model(encoder, restore=param.encoder_path)
        cls_classifier = init_model(cls_classifier, restore=param.cls_classifier_path)
        dom_classifier = init_model(dom_classifier, restore=param.dom_classifier_path)
    else:
        encoder = init_model(encoder)
        cls_classifier = init_model(cls_classifier)
        dom_classifier = init_model(dom_classifier)

    print("=== Start Training ===")
    if args.train:
        encoder, cls_classifier, dom_classifier = train(
            args, encoder, cls_classifier, dom_classifier,
            src_data_loader, src_data_loader_eval,
            tgt_data_loader, tgt_data_loader_all)

    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> after training <<<")
    evaluate(encoder, cls_classifier, tgt_data_loader_all)
def main(paras):
    logger = logging.getLogger(__name__)
    if paras.save_log_file:
        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S',
                            level=paras.logging_level,
                            filename=f'{paras.log_save_path}/{paras.train_log_file}',
                            filemode='w')
    else:
        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S',
                            level=paras.logging_level)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    logger.info(f'Loading model: {paras.model_name}')
    tokenizer = BertTokenizer.from_pretrained(paras.model_name)
    bert = BertModel.from_pretrained(paras.model_name)

    train_dataset = RE_Dataset(paras, 'train')
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=paras.batch_size,
                                  shuffle=paras.shuffle,
                                  drop_last=paras.drop_last)
    label_to_index = train_dataset.label_to_index
    special_token_list = list(train_dataset.special_token_set)

    # fixme: add special token to tokenizer
    special_tokens_dict = {'additional_special_tokens': special_token_list}
    tokenizer.add_special_tokens(special_tokens_dict)
    # bert.resize_token_embeddings(len(tokenizer))

    test_dataset = RE_Dataset(paras, 'test')
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=paras.batch_size,
                                 shuffle=paras.shuffle,
                                 drop_last=paras.drop_last)

    bert_classifier = BertClassifier(bert, paras.hidden_size,
                                     paras.label_number, paras.dropout_prob)

    if paras.optimizer == 'adam':
        logger.info('Loading Adam optimizer.')
        optimizer = torch.optim.Adam(bert_classifier.parameters(), lr=paras.learning_rate)
    elif paras.optimizer == 'adamw':
        logger.info('Loading AdamW optimizer.')
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in bert_classifier.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in bert_classifier.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=paras.learning_rate,
                          eps=paras.adam_epsilon)
    else:
        logger.warning(f'optimizer must be "Adam" or "AdamW", but got {paras.optimizer}.')
        logger.info('Loading Adam optimizer.')
        optimizer = torch.optim.Adam(bert_classifier.parameters(), lr=paras.learning_rate)

    logger.info('Training Start.')
    best_eval = {'acc': 0, 'precision': 0, 'recall': 0, 'f1': 0, 'loss': 0}
    for epoch in range(paras.num_train_epochs):
        epoch_loss = 0
        bert_classifier.train()
        for step, batch in enumerate(train_dataloader):
            optimizer.zero_grad()
            batch_data, batch_label = batch
            encoded_data = tokenizer(batch_data,
                                     padding=True,
                                     truncation=True,
                                     return_tensors='pt',
                                     max_length=paras.max_sequence_length)
            label_tensor = batch_label_to_idx(batch_label, label_to_index)
            loss = bert_classifier(encoded_data, label_tensor)
            epoch_loss += loss_to_int(loss)
            logging.info(f'epoch: {epoch}, step: {step}, loss: {loss:.4f}')
            # fixme: del
            # acc, precision, recall, f1 = evaluation(bert_classifier, tokenizer, test_dataloader,
            #                                         paras.max_sequence_length, label_to_index)
            # logger.info(f'Accuracy: {acc:.4f}, Precision: {precision:.4f}, '
            #             f'Recall: {recall:.4f}, F1-score: {f1:.4f}')
            loss.backward()
            optimizer.step()

        epoch_loss = epoch_loss / len(train_dataloader)
        acc, precision, recall, f1 = evaluation(bert_classifier, tokenizer, test_dataloader,
                                                paras.max_sequence_length, label_to_index)
        logging.info(f'Epoch: {epoch}, Epoch-Average Loss: {epoch_loss:.4f}')
        logger.info(f'Accuracy: {acc:.4f}, Precision: {precision:.4f}, '
                    f'Recall: {recall:.4f}, F1-score: {f1:.4f}')

        if best_eval['loss'] == 0 or f1 > best_eval['f1']:
            best_eval['loss'] = epoch_loss
            best_eval['acc'] = acc
            best_eval['precision'] = precision
            best_eval['recall'] = recall
            best_eval['f1'] = f1
            torch.save(bert_classifier, f'{paras.log_save_path}/{paras.model_save_name}')
            with open(f'{paras.log_save_path}/{paras.checkpoint_file}', 'w') as wf:
                wf.write(f'Save time: {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}\n')
                wf.write(f'Best F1-score: {best_eval["f1"]:.4f}\n')
                wf.write(f'Precision: {best_eval["precision"]:.4f}\n')
                wf.write(f'Recall: {best_eval["recall"]:.4f}\n')
                wf.write(f'Accuracy: {best_eval["acc"]:.4f}\n')
                wf.write(f'Epoch-Average Loss: {best_eval["loss"]:.4f}\n')
            logger.info(f'Updated model, best F1-score: {best_eval["f1"]:.4f}\n')

    logger.info(f'Train complete, Best F1-score: {best_eval["f1"]:.4f}.')
# coding: utf-8
# @File: predict.py
# @Author: HE D.H.
# @Email: [email protected]
# @Time: 2020/10/10 17:13:57
# @Description:

import torch
from model import BertClassifier
from transformers import BertTokenizer, BertConfig

# news category labels: sports, entertainment, home & living, real estate, education,
# fashion, politics, gaming, technology, finance
labels = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经']

bert_config = BertConfig.from_pretrained('bert-base-chinese')
model = BertClassifier(bert_config, len(labels))
model.load_state_dict(
    torch.load('models/best_model.pkl', map_location=torch.device('cpu')))
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

print('新闻类别分类')  # "news category classification"
while True:
    text = input('Input: ')
    token = tokenizer(text,
                      add_special_tokens=True,
                      padding='max_length',
                      truncation=True,
                      max_length=512)
    input_ids = token['input_ids']
    attention_mask = token['attention_mask']
    token_type_ids = token['token_type_ids']
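    # The loop above stops before the actual classification step. What follows is only a
    # minimal sketch of the inference call, assuming the forward signature
    # (input_ids, attention_mask, token_type_ids) used by the matching training script
    # later in this section; the exact BertClassifier interface is not shown in this snippet.
    with torch.no_grad():
        predicted = model(
            input_ids=torch.tensor([input_ids], dtype=torch.long),
            attention_mask=torch.tensor([attention_mask], dtype=torch.long),
            token_type_ids=torch.tensor([token_type_ids], dtype=torch.long),
        )
        pred_label = torch.argmax(predicted, dim=1)
    print('Label:', labels[pred_label.item()])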
    attention_mask = (x != 0).float().to(config.DEVICE).long()
    outputs = MODEL(x, attention_mask=attention_mask)
    return outputs.cpu().detach().numpy()


@app.route('/predict')
def predict():
    comment = request.args.get('comment')
    start_time = time.time()
    prediction = comment_prediction(comment)
    response = {
        'response': {
            label: str(prob)
            for label, prob in zip(config.CLASS_COLS, prediction[0])
        }
    }
    response['response']['comment'] = comment
    response['response']['time_taken'] = str(time.time() - start_time)
    return flask.jsonify(response)


if __name__ == '__main__':
    bert_config = BertConfig.from_pretrained(config.BERT_NAME)
    bert_config.num_labels = config.NUM_CLASSES
    MODEL = BertClassifier(bert_config)
    MODEL.load_state_dict(torch.load(config.TRAINED_MODEL_PATH))
    MODEL.to(config.DEVICE)
    MODEL.eval()
    app.run(host=config.HOST, port=config.PORT)
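# A small client-side usage sketch for the Flask endpoint above, run from a separate
# process; the address http://localhost:5000 is a placeholder for whatever
# config.HOST / config.PORT are set to in the actual deployment.
import requests

resp = requests.get('http://localhost:5000/predict',
                    params={'comment': 'great product, works as advertised'})
# expected shape: {'response': {<label>: <probability>, ..., 'comment': ..., 'time_taken': ...}}
print(resp.json())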
def train(dataloader, head_trans, body_trans, classifier,
          load_model=False, save_model=True, num_epochs=2):
    torch.backends.cudnn.benchmark = True
    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = 'cpu'
    print(device)
    load_model = load_model
    save_model = save_model
    learning_rate = 3e-3
    num_epochs = num_epochs

    # For tensorboard
    writer = SummaryWriter('runs/bert')
    step = 0

    # Initialize Model
    model = BertClassifier(head_trans, body_trans, classifier).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    if load_model:
        model, optimizer, step = load_checkpoint(
            torch.load('bert_chkpnt/my_checkpoint.pth.tar'), model, optimizer)
        return model

    for epoch in range(num_epochs):
        if save_model:
            checkpoint = {
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'step': step
            }
            save_checkpoint(checkpoint)

        running_loss = 0.0
        running_accuracy = 0
        loop = tqdm(enumerate(dataloader), total=len(dataloader), leave=False)
        for batch, (head, body, stance) in loop:
            outputs = model(head.to(device), body.to(device))
            loss = criterion(outputs.float(), stance.to(device).long())
            writer.add_scalar('Training Loss', loss.item(), step)
            step += 1
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Update progress bar
            loop.set_description(f'Epoch [{epoch+1}/{num_epochs}]')
            loop.set_postfix(loss=loss.item())

            running_loss += loss.item()
            running_accuracy += ((torch.argmax(outputs, dim=1) ==
                                  stance.to(device)).sum().item()) / BATCH_SIZE
            if (batch + 1) % 10 == 0:
                writer.add_scalar('Running Loss', running_loss / 10,
                                  epoch * len(dataloader) + batch)
                writer.add_scalar('Running Accuracy', running_accuracy / 10,
                                  epoch * len(dataloader) + batch)
                running_loss = 0.0
                running_accuracy = 0
    return model
def main():
    device = torch.device('cuda:3')

    # build the datasets
    print('Loading training data')
    train_data = load_data('dataset/train.csv')
    print('Loading validation data')
    valid_data = load_data('dataset/test.csv')
    # test_data = load_data('cnews/cnews.test.txt')

    batch_size = 16

    # build the batches
    print('Building batches')
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=3)
    valid_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, num_workers=3)
    # test_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=False)

    # read the BERT config
    bert_config = BertConfig.from_pretrained('./chinese_wwm_pytorch')
    bert_config.num_labels = num_labels
    print(bert_config)

    # initialize the model
    model = BertClassifier(bert_config)
    # model.to(device)

    # hyperparameters
    EPOCHS = 20
    learning_rate = 5e-6  # the learning rate should not be too large
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # cross-entropy loss
    criterion = nn.CrossEntropyLoss()

    with open('output.txt', 'w') as wf:
        wf.write('Batch Size: ' + str(batch_size) + '\tLearning Rate: ' + str(learning_rate) + '\n')

    best_acc = 0

    # set up parallel training: DataParallel puts the parameters on the GPU of device_ids[0]
    # by default, so device_ids must match the cuda:3 device chosen above
    net = torch.nn.DataParallel(model, device_ids=[3, 4])
    net.to(device)
    # model.module.avgpool = nn.AdaptiveAvgPool2d(7)

    # start training
    for Epoch in range(1, EPOCHS + 1):
        losses = 0      # loss
        accuracy = 0    # accuracy
        print('Epoch:', Epoch)
        model.train()
        for batch_index, batch in enumerate(train_dataloader):
            # print(batch_index)
            # print(batch)
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            token_type_ids = batch[2].to(device)
            label_ids = batch[3].to(device)
            # feed the three inputs to the model
            output = net(  # forward
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )
            loss = criterion(output, label_ids)
            losses += loss.item()

            pred_labels = torch.argmax(output, dim=1)  # predicted labels
            acc = torch.sum(pred_labels == label_ids.to(device)).item() / len(pred_labels)  # accuracy
            accuracy += acc
            # print accuracy and loss during training
            # print('Epoch: %d | Train: | Batch: %d / %d | Acc: %f | Loss: %f' % (Epoch, batch_index + 1, len(train_dataloader), acc, loss.item()))

            # zero the gradients, backpropagate the loss, update the parameters
            model.zero_grad()
            loss.backward()
            optimizer.step()
            # torch.cuda.empty_cache()

        average_loss = losses / len(train_dataloader)
        average_acc = accuracy / len(train_dataloader)
        # print this epoch's training results
        print('\tTrain ACC:', average_acc, '\tLoss:', average_loss)
        # with open('output.txt', 'a') as rf:
        #     output_to_file = '\nEpoch: ' + str(Epoch) + '\tTrain ACC:' + str(average_acc) + '\tLoss: ' + str(average_loss)
        #     rf.write(output_to_file)

        # validation
        model.eval()
        losses = 0      # loss
        accuracy = 0    # accuracy
        # evaluate on the validation set
        for batch_index, batch in enumerate(valid_dataloader):
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            token_type_ids = batch[2].to(device)
            label_ids = batch[3].to(device)
            with torch.no_grad():
                output = model(  # forward
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                )
                loss = criterion(output, label_ids)
                losses += loss.item()
                # both operations act directly on the output tensor
                pred_labels = torch.argmax(output, dim=1)  # predicted labels
                acc = torch.sum(pred_labels == label_ids.to(device)).item() / len(pred_labels)  # accuracy
                accuracy += acc

        average_loss = losses / len(valid_dataloader)
        average_acc = accuracy / len(valid_dataloader)
        print('\tValid ACC:', average_acc, '\tLoss:', average_loss)
        # with open('output.txt', 'a') as rf:
        #     output_to_file = '\nEpoch: ' + str(Epoch) + '\tValid ACC:' + str(average_acc) + '\tLoss: ' + str(average_loss) + '\n'
        #     rf.write(output_to_file)

        if average_acc > best_acc:
            best_acc = average_acc
            torch.save(model.state_dict(), 'best_model_on_trainset.pkl')
def main(args, f):
    # args = parse_arguments()
    set_seed(args.train_seed)

    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    src_eval_loader, src_loader, tgt_all_loader, tgt_train_loader = get_all_dataloader(
        args, tokenizer)

    # load models
    if args.model == 'bert':
        src_encoder = BertEncoder()
        tgt_encoder = BertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        tgt_encoder = DistilBertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'roberta':
        src_encoder = RobertaEncoder()
        tgt_encoder = RobertaEncoder()
        src_classifier = RobertaClassifier()
    else:
        src_encoder = DistilRobertaEncoder()
        tgt_encoder = DistilRobertaEncoder()
        src_classifier = RobertaClassifier()
    discriminator = Discriminator()  # output dims is 2 instead of 1

    if args.load:
        src_encoder = init_model(args, src_encoder, restore_path=param.src_encoder_path)
        src_classifier = init_model(args, src_classifier, restore_path=param.src_classifier_path)
        tgt_encoder = init_model(args, tgt_encoder, restore_path=param.tgt_encoder_path)
        discriminator = init_model(args, discriminator, restore_path=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        src_classifier = init_model(args, src_classifier)
        tgt_encoder = init_model(args, tgt_encoder)
        discriminator = init_model(args, discriminator)

    # parallel models
    if torch.cuda.device_count() > 1:
        print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
        src_encoder = nn.DataParallel(src_encoder)
        src_classifier = nn.DataParallel(src_classifier)
        tgt_encoder = nn.DataParallel(tgt_encoder)
        discriminator = nn.DataParallel(discriminator)

    # train source model
    print("=== Training classifier for source domain ===")
    if args.pretrain:
        src_encoder, src_classifier = pretrain(args, src_encoder, src_classifier, src_loader)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(args, src_encoder, src_classifier, src_loader)
    src_acc = evaluate(args, src_encoder, src_classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: No adapt acc on src data: {src_acc}\n')

    for params in src_encoder.parameters():
        params.requires_grad = False

    # train target encoder by ADDA
    print("=== Training encoder for target domain ===")
    if args.adapt:
        tgt_encoder.load_state_dict(src_encoder.state_dict())
        tgt_encoder = adda_adapt(args, src_encoder, tgt_encoder, discriminator,
                                 src_loader, tgt_train_loader)

    # argument setting
    print(
        f"model_type: {args.model}; max_seq_len: {args.max_seq_length}; batch_size: {args.batch_size}; "
        f"pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; src: {args.src}; tgt: {args.tgt}")

    # eval target encoder on lambda0.1 set of target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> domain adaption <<<")
    tgt_acc = evaluate(args, tgt_encoder, src_classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: DA acc on tgt data: {tgt_acc}\n')
    f.write(
        f"model_type: {args.model}; batch_size: {args.batch_size}; pre_epochs: {args.pre_epochs}; "
        f"num_epochs: {args.num_epochs}; src_free: {args.src_free}; src: {args.src}; "
        f"tgt: {args.tgt};\n\n")
def main():
    # hyperparameters
    batch_size = 4
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    epochs = 10
    learning_rate = 5e-6  # the learning rate should not be too large

    # build the datasets
    train_dataset = CNewsDataset('data/cnews/cnews.train.txt')
    valid_dataset = CNewsDataset('data/cnews/cnews.val.txt')
    # test_data = load_data('cnews/cnews.test.txt')

    # build the batches
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
    # test_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=False)

    # read the BERT config
    bert_config = BertConfig.from_pretrained('bert-base-chinese')
    num_labels = len(train_dataset.labels)

    # initialize the model
    model = BertClassifier(bert_config, num_labels).to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()

    best_acc = 0

    for epoch in range(1, epochs + 1):
        losses = 0      # loss
        accuracy = 0    # accuracy

        model.train()
        train_bar = tqdm(train_dataloader)
        for input_ids, token_type_ids, attention_mask, label_id in train_bar:
            model.zero_grad()
            train_bar.set_description('Epoch %i train' % epoch)

            output = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                token_type_ids=token_type_ids.to(device),
            )

            loss = criterion(output, label_id.to(device))
            losses += loss.item()

            pred_labels = torch.argmax(output, dim=1)  # predicted labels
            acc = torch.sum(pred_labels == label_id.to(device)).item() / len(pred_labels)  # accuracy
            accuracy += acc

            loss.backward()
            optimizer.step()
            train_bar.set_postfix(loss=loss.item(), acc=acc)

        average_loss = losses / len(train_dataloader)
        average_acc = accuracy / len(train_dataloader)
        print('\tTrain ACC:', average_acc, '\tLoss:', average_loss)

        # validation
        model.eval()
        losses = 0      # loss
        accuracy = 0    # accuracy
        valid_bar = tqdm(valid_dataloader)
        for input_ids, token_type_ids, attention_mask, label_id in valid_bar:
            valid_bar.set_description('Epoch %i valid' % epoch)

            output = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                token_type_ids=token_type_ids.to(device),
            )

            loss = criterion(output, label_id.to(device))
            losses += loss.item()

            pred_labels = torch.argmax(output, dim=1)  # predicted labels
            acc = torch.sum(pred_labels == label_id.to(device)).item() / len(pred_labels)  # accuracy
            accuracy += acc
            valid_bar.set_postfix(loss=loss.item(), acc=acc)

        average_loss = losses / len(valid_dataloader)
        average_acc = accuracy / len(valid_dataloader)
        print('\tValid ACC:', average_acc, '\tLoss:', average_loss)

        if average_acc > best_acc:
            best_acc = average_acc
            torch.save(model.state_dict(), 'models/best_model.pkl')
class Predictor(object):
    def __init__(self, config):
        self.model = None
        self.config = config

        self.output_path = config["output_path"]
        self.vocab_path = os.path.join(config["bert_model_path"], "vocab.txt")
        self.label_to_index = self.load_vocab()
        self.index_to_label = {value: key for key, value in self.label_to_index.items()}
        self.word_vectors = None
        self.sequence_length = self.config["sequence_length"]

        # create the model
        self.create_model()
        # load the computation graph
        self.load_graph()

    def load_vocab(self):
        # load the label-to-index mapping
        with open(os.path.join(self.output_path, "label_to_index.json"), "r") as f:
            label_to_index = json.load(f)
        return label_to_index

    def padding(self, input_id, input_mask, segment_id):
        """
        Pad the sequences to the fixed length.
        :param input_id:
        :param input_mask:
        :param segment_id:
        :return:
        """
        if len(input_id) < self.sequence_length:
            pad_input_id = input_id + [0] * (self.sequence_length - len(input_id))
            pad_input_mask = input_mask + [0] * (self.sequence_length - len(input_mask))
            pad_segment_id = segment_id + [0] * (self.sequence_length - len(segment_id))
        else:
            pad_input_id = input_id[:self.sequence_length]
            pad_input_mask = input_mask[:self.sequence_length]
            pad_segment_id = segment_id[:self.sequence_length]

        return pad_input_id, pad_input_mask, pad_segment_id

    def sentence_to_idx(self, text):
        """
        Convert a tokenized sentence into its index representation.
        :return:
        """
        tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_path, do_lower_case=True)
        text = tokenization.convert_to_unicode(text)
        tokens = tokenizer.tokenize(text)
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        input_id = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_id)
        segment_id = [0] * len(input_id)

        input_id, input_mask, segment_id = self.padding(input_id, input_mask, segment_id)

        return [input_id], [input_mask], [segment_id]

    def load_graph(self):
        """
        Load the computation graph from a checkpoint.
        :return:
        """
        self.sess = tf.Session()
        ckpt = tf.train.get_checkpoint_state(self.config["ckpt_model_path"])
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Reloading model parameters..')
            self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('No such file:[{}]'.format(self.config["ckpt_model_path"]))

    def create_model(self):
        """
        Select the model specified in the config file and initialize it.
        :return:
        """
        self.model = BertClassifier(config=self.config, is_training=False)

    def predict(self, text):
        """
        Predict the class of a tokenized sentence.
        :param text:
        :return:
        """
        input_ids, input_masks, segment_ids = self.sentence_to_idx(text)

        prediction = self.model.infer(self.sess,
                                      dict(input_ids=input_ids,
                                           input_masks=input_masks,
                                           segment_ids=segment_ids)).tolist()[0]
        label = self.index_to_label[prediction]

        return label
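# A minimal usage sketch for the TensorFlow Predictor above, assuming a config dict with the
# keys it reads (output_path, bert_model_path, sequence_length, ckpt_model_path); the values
# shown here are placeholders, not paths from the original project.
config = {
    "output_path": "output",
    "bert_model_path": "chinese_L-12_H-768_A-12",
    "sequence_length": 128,
    "ckpt_model_path": "ckpt_model",
}
predictor = Predictor(config)
print(predictor.predict("这部 电影 的 配乐 非常 出色"))  # returns a label string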
        total_correct += (res == gt).sum()
        total_count += gt.shape[0]
        total_loss.append(criterion(preds, labels).item())

    loss, acc = np.array(total_loss).mean(), total_correct / total_count
    print("Average Loss: {:.6f}, Accuracy: {:.6f}".format(loss, acc))
    return loss, acc


device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 30
best_acc = 0.0
eval_losses, eval_accs = [], []
train_losses, train_accs = [], []

model = BertClassifier(freeze_bert=False)
model = model.to(device)
# model = nn.DataParallel(model)

train_dataset = EmojiDataset('../../data/train_bert_sentences.npy',
                             '../../data/train_bert_labels.npy')
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

test_dataset = EmojiDataset('../../data/test_bert_sentences.npy',
                            '../../data/test_bert_labels.npy')
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False,
def main(args, f):
    # args = parse_arguments()
    set_seed(args.train_seed)

    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    src_eval_loader, src_loader, tgt_all_loader, tgt_train_loader = get_all_dataloader(
        args, tokenizer)

    # load models
    if args.model == 'bert':
        encoder = BertEncoder()
        src_encoder = BertEncoder()
        classifier = BertClassifier()
    elif args.model == 'distilbert':
        encoder = DistilBertEncoder()
        src_encoder = DistilBertEncoder()
        classifier = BertClassifier()
    elif args.model == 'roberta':
        encoder = RobertaEncoder()
        src_encoder = RobertaEncoder()
        classifier = RobertaClassifier()
    else:
        encoder = DistilRobertaEncoder()
        src_encoder = DistilRobertaEncoder()
        classifier = RobertaClassifier()

    # domain discriminator
    discriminator = AdversarialNetworkCdan(param.input_dim * param.num_labels, param.hidden_dim)

    # parallel models
    if torch.cuda.device_count() > 1:
        print('Let\'s use {} GPUs!'.format(torch.cuda.device_count()))
        encoder = nn.DataParallel(encoder)
        src_encoder = nn.DataParallel(src_encoder)
        classifier = nn.DataParallel(classifier)
        discriminator = nn.DataParallel(discriminator)

    if args.load:
        encoder = init_model(args, encoder, restore_path=param.src_encoder_path)
        src_encoder = init_model(args, src_encoder, restore_path=param.tgt_encoder_path)
        classifier = init_model(args, classifier, restore_path=param.src_classifier_path)
        # discriminator = init_model(args, discriminator, restore_path=param.d_model_path)
    else:
        encoder = init_model(args, encoder)
        src_encoder = init_model(args, src_encoder)
        classifier = init_model(args, classifier)
        discriminator = init_model(args, discriminator)

    # train source model
    print("=== Pretrain encoder for source domain ===")
    if args.pretrain:
        encoder, classifier = pretrain(args, encoder, classifier, src_loader)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(args, encoder, classifier, src_loader)
    src_acc = evaluate(args, encoder, classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt} no adapt acc on src data: {src_acc}\n')

    # x, y = save_features(args, encoder, src_loader)
    # np.savez(os.path.join(param.model_root, 's_feat_pretrain'), x, y)
    # x, y = save_features(args, encoder, tgt_all_loader)
    # np.savez(os.path.join(param.model_root, 't_feat_pretrain'), x, y)

    # adapt
    print("=== Adapt encoder for target domain ===")
    src_encoder.load_state_dict(encoder.state_dict())
    if args.src_free:
        # reusing the same encoder vs. copying it into src_encoder gives different baseline results
        s_res_features = src_gmm(args, encoder, src_loader)
        src_loader = s_numpy_dataloader(s_res_features, args.batch_size)
        encoder, classifier = cdan_adapt_src_free(args, encoder, src_encoder, discriminator,
                                                  classifier, src_loader, tgt_train_loader,
                                                  tgt_all_loader)
    elif args.data_free:
        s_res_features = src_gmm(args, encoder, src_loader)
        t_res_features = tgt_gmm(encoder, tgt_all_loader, 1)
        src_loader = s_numpy_dataloader(s_res_features, args.batch_size)
        tgt_train_loader = t_numpy_dataloader(t_res_features, args.batch_size)
        encoder, classifier = cdan_adapt_data_free(args, encoder, discriminator, classifier,
                                                   src_loader, tgt_train_loader, tgt_all_loader)
    else:
        encoder, classifier = cdan_adapt(args, encoder, discriminator, classifier,
                                         src_loader, tgt_train_loader, tgt_all_loader)

    # x, y = save_features(args, encoder, src_loader)
    # np.savez(os.path.join(param.model_root, 's_feat_adapt_cdan'), x, y)
    # x, y = save_features(args, encoder, tgt_all_loader)
    # np.savez(os.path.join(param.model_root, 't_feat_adapt_cdan'), x, y)

    # argument setting
    print(
        f"model_type: {args.model}; batch_size: {args.batch_size}; data_free: {args.data_free}; "
        f"src_free: {args.src_free}; pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; "
        f"src: {args.src}; tgt: {args.tgt}; kd: {args.kd}; dp: {args.dp}; ent: {args.ent}")

    # eval target encoder on lambda0.1 set of target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> domain adaption <<<")
    tgt_acc = evaluate(args, encoder, classifier, tgt_all_loader)
    f.write(f'{args.src} -> {args.tgt}: DA acc on tgt data: {tgt_acc}\n')
    f.write(
        f"model_type: {args.model}; batch_size: {args.batch_size}; data_free: {args.data_free}; "
        f"src_free: {args.src_free}; pre_epochs: {args.pre_epochs}; num_epochs: {args.num_epochs}; "
        f"src: {args.src}; tgt: {args.tgt}; kd: {args.kd}; dp: {args.dp}; ent: {args.ent}\n\n")
import torch
from model import BertClassifier
from transformers import BertTokenizer, BertConfig
from train import get_bert_input
import pandas as pd

labels = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经']

bert_config = BertConfig.from_pretrained('chinese_wwm_pytorch')
bert_config.num_labels = len(labels)

model = BertClassifier(bert_config)
model.load_state_dict(
    torch.load('./best_model_on_trainset.pkl', map_location=torch.device('cpu')))
tokenizer = BertTokenizer(vocab_file='chinese_wwm_pytorch/vocab.txt')

device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
model = torch.nn.DataParallel(model, device_ids=[2])
model.to(device)


def predict_text(text):
    input_id, attention_mask, token_type_id = get_bert_input(text, tokenizer)
    input_id = torch.tensor([input_id], dtype=torch.long)
    attention_mask = torch.tensor([attention_mask], dtype=torch.long)
    token_type_id = torch.tensor([token_type_id], dtype=torch.long)
    predicted = model(
        input_id,
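        # The call above is cut off in this snippet. A sketch of how it presumably finishes,
        # assuming the forward takes (input_ids, attention_mask, token_type_ids) in that order,
        # as in the matching training script earlier in this section; the exact signature of
        # this BertClassifier is an assumption.
        attention_mask,
        token_type_id,
    )
    pred_label = torch.argmax(predicted, dim=1)
    return labels[pred_label.item()]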
# model = EmbedCosSim(text_field, embedding_dim, use_glove=True, glove_dim=100,
#                     checkpoint_name='checkpoints/embed_cos_sim_glove.pt')  # for training model with GloVe

# model = RNNClassifier(text_field, embedding_dim, hidden_dim, rnn_type="GRU", bidir=False,
#                       checkpoint_name='checkpoints/gru.pt')
# in the above line, you can change rnn_type to either RNN_TANH, GRU, or LSTM to create a different network
# you can also set bidir=True to create a bidirectional network

# model = CNNClassifier(text_field, embedding_dim, num_filters=32, filter_sizes=[1, 2, 3, 5],
#                       checkpoint_name='checkpoints/cnn.pt')

tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased', do_lower=True)
train_iter, val_iter, test_iter, text_field, label_field = prep_torch_data(
    batch_size=32, transformer_tokenize=tokenizer)
bert = transformers.BertModel.from_pretrained('bert-base-uncased')
for i in bert.parameters():
    i.requires_grad = False
model = BertClassifier(bert, checkpoint_name='checkpoints/bert.pt')

optimizer = optim.Adam(model.parameters())

# move everything to gpu if available
device = ("cuda" if torch.cuda.is_available() else "cpu")
if device == "cuda":
    model.cuda()
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

train(model, train_iter, val_iter, test_iter, optimizer, criterion, n_epochs=50,
                             second_key=args.second_sentence, device="cuda:0")
dev_dataset = make_dataset(tokenizer, dev_data,
                           pos_label=args.pos_label,
                           answer_field=args.answer_field,
                           first_key=args.first_sentence,
                           second_key=args.second_sentence,
                           device="cuda:0")
train_dataloader = make_dataloader(train_dataset, batch_size=args.train_batch_size)
dev_dataloader = make_dataloader(dev_dataset, batch_size=args.dev_batch_size, shuffle=False)

if args.batch_size is None:
    args.batch_size = args.train_batch_size
if args.batch_size % args.train_batch_size != 0:
    raise ValueError("GPU batch size should divide batch size per update.")
batches_per_update = args.batch_size // args.train_batch_size

bert_classifier = BertClassifier(model,
                                 state_key="pooler_output",
                                 lr=args.lr,
                                 accumulate_gradients=batches_per_update).to("cuda:0")
best_score, best_weights = 0.0, None
if args.load_file:
    bert_classifier.load_state_dict(torch.load(args.load_file))
if args.train:
    model.train()
    for epoch in range(args.nepochs):
        progress_bar = tqdm.tqdm(train_dataloader)
        metrics = initialize_metrics()
        for i, batch in enumerate(progress_bar, 1):
            outputs = bert_classifier.train_on_batch(batch)
            postfix = update_metrics(metrics, outputs, batch["labels"])
            progress_bar.set_postfix(postfix)
            if (args.eval_every_n_batches > 0 and
                    i % args.eval_every_n_batches == 0 and
class Evaluator(object):
    def __init__(self, args):
        pretrain_name = 'bert-base-cased'
        if args.model_info.bert_path:
            pretrain_name = args.model_info.bert_path
        print(f"Tokenizer from: {pretrain_name}")
        train_conf = args.train_info
        model_conf = args.model_info
        self.model_type = model_conf.model
        if self.model_type == 'bert_seq':
            self.model = BertClassifier(model_conf)
            self.tokenizer = BertTokenizer.from_pretrained(pretrain_name)
            self.ds = SentimentDataset
        if self.model_type == 'GPT2':
            self.model = GPT2Classifier(model_conf)
            self.tokenizer = GPT2Tokenizer.from_pretrained(pretrain_name)
            self.ds = GPT2Dataset
        self.model.load_state_dict(torch.load(train_conf.model_path))
        self.device = train_conf.device
        self.class_num = model_conf.class_num
        self.model.to(self.device)
        self.lr = train_conf.lr
        self.max_len = train_conf.max_seq_len
        self.conf = args
        self.label_map = json.load(open(args.label_map_path))
        self.id2label = {i: label_str for label_str, i in self.label_map.items()}

    def run(self, batch_size=64):
        test_path = self.conf.train_info.test_path
        test_loader = self.get_data_loader(test_path, batch_size)
        acc, recall, f1_score, cm, report, res = self.evaluate(test_loader)
        print(f"Accuracy score of the model is {acc}")
        print(f"Recall score of the model is {recall}")
        print(f"F1 score of the model is {f1_score}")
        print(f"Confusion matrix of the model is {cm}")
        print(report)
        dir_ = os.path.dirname(test_path)
        dir_ = os.path.dirname(dir_)
        dir_ = os.path.split(dir_)[0]
        new_path = os.path.join(dir_, 'logs', 'bad_case.json')
        f = open(new_path, 'w')
        for i in res:
            print(json.dumps(i, ensure_ascii=False), file=f)

    def evaluate(self, _loader):
        self.model.eval()
        y_true = list()
        y_pred = list()
        res = []
        with torch.no_grad():
            for batch in _loader:
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                y = batch['labels']
                y = torch.squeeze(y, 1)
                y = y.to(self.device)
                logits = self.model(input_ids, attention_mask)
                y_true.append(y)
                y_pred.append(logits)
                pred_labels = torch.argmax(logits, dim=1)
                preds = pred_labels.cpu().numpy()
                true = batch['labels'].squeeze().numpy()
                if len(true) < 1:
                    continue
                for i, c_y in enumerate(true):
                    if c_y != preds[i]:
                        tmp_dict = {
                            'true_label': self.id2label[c_y],
                            'pred_label': self.id2label[preds[i]],
                            'text': batch['text'][i]
                        }
                        res.append(tmp_dict)
        y_true = torch.cat(y_true)
        y_pred = torch.cat(y_pred)
        cm = metrics.cal_cm(y_true, y_pred)
        acc_score = metrics.cal_accuracy(y_true, y_pred)
        recall = metrics.cal_recall(y_true, y_pred)
        f1_score = metrics.cal_f1(y_true, y_pred)
        label_range = [i for i in range(len(self.label_map))]
        target_name = [x[0] for x in sorted(self.label_map.items(), key=lambda x: x[1])]
        report = metrics.get_classification_report(y_true, y_pred, label_range, target_name)
        return acc_score, recall, f1_score, cm, report, res

    def get_data_loader(self, f_path, batch_size):
        np.random.seed(14)
        texts, labels = prepare(f_path, self.label_map)
        ds = self.ds(self.tokenizer, texts, labels, self.max_len)
        return dataloader.DataLoader(ds,
                                     batch_size=batch_size,
                                     num_workers=self.conf.num_workers,
                                     shuffle=True)
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, train_size, test_size = load_corpus(
    dataset)
'''
y_train, y_val, y_test: n*c matrices
train_mask, val_mask, test_mask: n-d bool array
train_size, test_size: unused
'''

# compute number of real train/val/test/word nodes and number of classes
nb_node = adj.shape[0]
nb_train, nb_val, nb_test = train_mask.sum(), val_mask.sum(), test_mask.sum()
nb_word = nb_node - nb_train - nb_val - nb_test
nb_class = y_train.shape[1]

# instantiate model according to class number
model = BertClassifier(pretrained_model=bert_init, nb_class=nb_class)

# transform one-hot label to class ID for pytorch computation
y = th.LongTensor((y_train + y_val + y_test).argmax(axis=1))
label = {}
label['train'], label['val'], label['test'] = \
    y[:nb_train], y[nb_train:nb_train + nb_val], y[-nb_test:]

# load documents and compute input encodings
corpus_file = './data/corpus/' + dataset + '_shuffle.txt'
with open(corpus_file, 'r') as f:
    text = f.read()
    text = text.replace('\\', '')
    text = text.split('\n')
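# The document list loaded above still has to be turned into input encodings before it can
# feed the classifier. A minimal sketch of that step, assuming the Hugging Face tokenizer
# that matches the pretrained checkpoint named by `bert_init`; the max_length value is a
# placeholder, not a setting from the original project.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(bert_init)
encoded = tokenizer(text, max_length=128, truncation=True,
                    padding='max_length', return_tensors='pt')
input_ids, attention_mask = encoded.input_ids, encoded.attention_mask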
def main():
    args = parse_arguments()
    # argument setting
    print("=== Argument Setting ===")
    print("src: " + args.src)
    print("tgt: " + args.tgt)
    print("seed: " + str(args.seed))
    print("train_seed: " + str(args.train_seed))
    print("model_type: " + str(args.model))
    print("max_seq_length: " + str(args.max_seq_length))
    print("batch_size: " + str(args.batch_size))
    print("pre_epochs: " + str(args.pre_epochs))
    print("num_epochs: " + str(args.num_epochs))
    print("AD weight: " + str(args.alpha))
    print("KD weight: " + str(args.beta))
    print("temperature: " + str(args.temperature))
    set_seed(args.train_seed)

    if args.model in ['roberta', 'distilroberta']:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # preprocess data
    print("=== Processing datasets ===")
    if args.src in ['blog', 'airline', 'imdb']:
        src_x, src_y = CSV2Array(os.path.join('data', args.src, args.src + '.csv'))
    else:
        src_x, src_y = XML2Array(
            os.path.join('data', args.src, 'negative.review'),
            os.path.join('data', args.src, 'positive.review'))

    src_x, src_test_x, src_y, src_test_y = train_test_split(
        src_x, src_y, test_size=0.2, stratify=src_y, random_state=args.seed)

    if args.tgt in ['blog', 'airline', 'imdb']:
        tgt_x, tgt_y = CSV2Array(os.path.join('data', args.tgt, args.tgt + '.csv'))
    else:
        tgt_x, tgt_y = XML2Array(
            os.path.join('data', args.tgt, 'negative.review'),
            os.path.join('data', args.tgt, 'positive.review'))

    tgt_train_x, tgt_test_x, tgt_train_y, tgt_test_y = train_test_split(
        tgt_x, tgt_y, test_size=0.2, stratify=tgt_y, random_state=args.seed)

    if args.model in ['roberta', 'distilroberta']:
        src_features = roberta_convert_examples_to_features(
            src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = roberta_convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = roberta_convert_examples_to_features(
            tgt_x, tgt_y, args.max_seq_length, tokenizer)
        tgt_train_features = roberta_convert_examples_to_features(
            tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)
    else:
        src_features = convert_examples_to_features(
            src_x, src_y, args.max_seq_length, tokenizer)
        src_test_features = convert_examples_to_features(
            src_test_x, src_test_y, args.max_seq_length, tokenizer)
        tgt_features = convert_examples_to_features(
            tgt_x, tgt_y, args.max_seq_length, tokenizer)
        tgt_train_features = convert_examples_to_features(
            tgt_train_x, tgt_train_y, args.max_seq_length, tokenizer)

    # load dataset
    src_data_loader = get_data_loader(src_features, args.batch_size)
    src_data_eval_loader = get_data_loader(src_test_features, args.batch_size)
    tgt_data_train_loader = get_data_loader(tgt_train_features, args.batch_size)
    tgt_data_all_loader = get_data_loader(tgt_features, args.batch_size)

    # load models
    if args.model == 'bert':
        src_encoder = BertEncoder()
        tgt_encoder = BertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'distilbert':
        src_encoder = DistilBertEncoder()
        tgt_encoder = DistilBertEncoder()
        src_classifier = BertClassifier()
    elif args.model == 'roberta':
        src_encoder = RobertaEncoder()
        tgt_encoder = RobertaEncoder()
        src_classifier = RobertaClassifier()
    else:
        src_encoder = DistilRobertaEncoder()
        tgt_encoder = DistilRobertaEncoder()
        src_classifier = RobertaClassifier()
    discriminator = Discriminator()

    if args.load:
        src_encoder = init_model(args, src_encoder, restore=param.src_encoder_path)
        src_classifier = init_model(args, src_classifier, restore=param.src_classifier_path)
        tgt_encoder = init_model(args, tgt_encoder, restore=param.tgt_encoder_path)
        discriminator = init_model(args, discriminator, restore=param.d_model_path)
    else:
        src_encoder = init_model(args, src_encoder)
        src_classifier = init_model(args, src_classifier)
        tgt_encoder = init_model(args, tgt_encoder)
        discriminator = init_model(args, discriminator)

    # train source model
    print("=== Training classifier for source domain ===")
    if args.pretrain:
        src_encoder, src_classifier = pretrain(args, src_encoder, src_classifier, src_data_loader)

    # eval source model
    print("=== Evaluating classifier for source domain ===")
    evaluate(src_encoder, src_classifier, src_data_loader)
    evaluate(src_encoder, src_classifier, src_data_eval_loader)
    evaluate(src_encoder, src_classifier, tgt_data_all_loader)

    for params in src_encoder.parameters():
        params.requires_grad = False

    for params in src_classifier.parameters():
        params.requires_grad = False

    # train target encoder by GAN
    print("=== Training encoder for target domain ===")
    if args.adapt:
        tgt_encoder.load_state_dict(src_encoder.state_dict())
        tgt_encoder = adapt(args, src_encoder, tgt_encoder, discriminator,
                            src_classifier, src_data_loader, tgt_data_train_loader,
                            tgt_data_all_loader)

    # eval target encoder on lambda0.1 set of target dataset
    print("=== Evaluating classifier for encoded target domain ===")
    print(">>> source only <<<")
    evaluate(src_encoder, src_classifier, tgt_data_all_loader)
    print(">>> domain adaption <<<")
    evaluate(tgt_encoder, src_classifier, tgt_data_all_loader)
def run():
    def collate_fn(batch: List[Tuple[torch.LongTensor, torch.LongTensor]],
                   device: torch.device) -> Tuple[torch.LongTensor, torch.LongTensor]:
        x, y = list(zip(*batch))
        x = pad_sequence(x, batch_first=True, padding_value=0)
        y = torch.stack(y)
        return x.to(device), y.to(device)

    df = pd.read_csv("../inputs/Train.csv")
    # test = pd.read_csv("../inputs/Test.csv")
    train_df, val_df = train_test_split(df, stratify=df.label,
                                        test_size=VALID_SIZE, random_state=SEED)
    labels = ["Depression", "Alcohol", "Suicide", "Drugs"]
    train = pd.concat([train_df["text"],
                       pd.get_dummies(train_df['label']).reindex(columns=labels)],
                      axis=1)  # .reset_index(drop=True)
    valid = pd.concat([val_df["text"],
                       pd.get_dummies(val_df['label']).reindex(columns=labels)],
                      axis=1)  # .reset_index(drop=True)

    if DEVICE == 'cpu':
        print('cpu')
    else:
        n_gpu = torch.cuda.device_count()
        print(torch.cuda.get_device_name(0))

    train_dataset = MentalHealthDataset(config.TOKENIZER, train, lazy=True)
    valid_dataset = MentalHealthDataset(config.TOKENIZER, valid, lazy=True)
    collate_fn = partial(collate_fn, device=DEVICE)
    train_sampler = RandomSampler(train_dataset)
    valid_sampler = RandomSampler(valid_dataset)
    train_iterator = DataLoader(train_dataset,
                                batch_size=config.TRAIN_BATCH_SIZE,
                                sampler=train_sampler,
                                collate_fn=collate_fn)
    valid_iterator = DataLoader(valid_dataset,
                                batch_size=config.VALID_BATCH_SIZE,
                                sampler=valid_sampler,
                                collate_fn=collate_fn)

    # model = BertClassifier().to(DEVICE)
    model = BertClassifier(BertModel.from_pretrained(config.BERT_PATH), 4).to(DEVICE)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    # triangular learning rate: linearly grows until half of the first epoch, then linearly decays
    warmup_steps = 10 ** 3
    total_steps = len(train_iterator) * config.EPOCHS - warmup_steps
    optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)
    # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=total_steps)
    # optimizer = torch.optim.Adam(model.parameters(), lr=LR)  # 1e-4
    # scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min",
    #                                                        patience=5, factor=0.3, min_lr=1e-10, verbose=True)

    for epoch in range(config.EPOCHS):
        print('=' * 5, f"EPOCH {epoch}", '=' * 5)
        engine.train_fn(train_iterator, model, optimizer, scheduler)
        engine.eval_fn(valid_iterator, model)

    model.eval()
    test_df = pd.read_csv("../inputs/Test.csv")
    submission = pd.read_csv('../inputs/SampleSubmission.csv')
    res = np.zeros((submission.shape[0], len(labels)))
    for i in tqdm(range(len(test_df) // config.TRAIN_BATCH_SIZE + 1)):
        batch_df = test_df.iloc[i * config.TRAIN_BATCH_SIZE:(i + 1) * config.TRAIN_BATCH_SIZE]
        assert (batch_df["ID"] == submission["ID"]
                [i * config.TRAIN_BATCH_SIZE:(i + 1) * config.TRAIN_BATCH_SIZE]).all(), "Id mismatch"
        texts = []
        for text in batch_df["text"].tolist():
            text = config.TOKENIZER.encode(text, add_special_tokens=True)
            if len(text) > config.MAX_LEN:
                text = text[:config.MAX_LEN - 1] + [config.TOKENIZER.sep_token_id]
            texts.append(torch.LongTensor(text))
        x = pad_sequence(texts, batch_first=True,
                         padding_value=config.TOKENIZER.pad_token_id).to(DEVICE)
        mask = (x != config.TOKENIZER.pad_token_id).float().to(DEVICE)
        with torch.no_grad():
            _, outputs = model(x, attention_mask=mask)
        outputs = outputs.cpu().numpy()
        submission.loc[i * config.TRAIN_BATCH_SIZE:(i * config.TRAIN_BATCH_SIZE + len(outputs) - 1),
                       labels] = outputs

    submission.to_csv("../subs/submission_2.csv", index=False)